{ "best_metric": 0.22805513441562653, "best_model_checkpoint": "models/llama3.2-3b-dpo-mini-reasoning/checkpoint-10000", "epoch": 0.9999801520354088, "eval_steps": 1000, "global_step": 25191, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 6.919936336585703e-05, "grad_norm": 5.471509939860072, "learning_rate": 3.457814661134163e-10, "logits/chosen": -0.96875, "logits/rejected": -0.5, "logps/chosen": -152.0, "logps/rejected": -187.0, "loss": 1.3828, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0006919936336585703, "grad_norm": 4.672813363681262, "learning_rate": 3.457814661134163e-09, "logits/chosen": -0.96484375, "logits/rejected": -0.310546875, "logps/chosen": -168.0, "logps/rejected": -156.0, "loss": 1.3841, "rewards/accuracies": 0.2083333283662796, "rewards/chosen": -1.7404556274414062e-05, "rewards/margins": 0.00015735626220703125, "rewards/rejected": -0.0001735687255859375, "step": 10 }, { "epoch": 0.0013839872673171406, "grad_norm": 4.895803752014187, "learning_rate": 6.915629322268326e-09, "logits/chosen": -1.0625, "logits/rejected": -0.4453125, "logps/chosen": -171.0, "logps/rejected": -151.0, "loss": 1.3846, "rewards/accuracies": 0.36250001192092896, "rewards/chosen": -0.0002346038818359375, "rewards/margins": -0.0001888275146484375, "rewards/rejected": -4.7206878662109375e-05, "step": 20 }, { "epoch": 0.002075980900975711, "grad_norm": 5.118897555503357, "learning_rate": 1.037344398340249e-08, "logits/chosen": -0.90625, "logits/rejected": -0.392578125, "logps/chosen": -167.0, "logps/rejected": -147.0, "loss": 1.3847, "rewards/accuracies": 0.2874999940395355, "rewards/chosen": 0.0002346038818359375, "rewards/margins": 0.0005340576171875, "rewards/rejected": -0.00029754638671875, "step": 30 }, { "epoch": 0.002767974534634281, "grad_norm": 6.015233424364166, "learning_rate": 1.3831258644536651e-08, "logits/chosen": -0.87890625, "logits/rejected": -0.357421875, "logps/chosen": -184.0, "logps/rejected": -173.0, "loss": 1.3849, "rewards/accuracies": 0.26875001192092896, "rewards/chosen": -0.0004673004150390625, "rewards/margins": 0.0007781982421875, "rewards/rejected": -0.001251220703125, "step": 40 }, { "epoch": 0.0034599681682928518, "grad_norm": 4.496643627251894, "learning_rate": 1.7289073305670815e-08, "logits/chosen": -0.859375, "logits/rejected": -0.37890625, "logps/chosen": -191.0, "logps/rejected": -168.0, "loss": 1.3843, "rewards/accuracies": 0.2750000059604645, "rewards/chosen": 0.0001087188720703125, "rewards/margins": -0.000141143798828125, "rewards/rejected": 0.0002498626708984375, "step": 50 }, { "epoch": 0.004151961801951422, "grad_norm": 4.457424453514859, "learning_rate": 2.074688796680498e-08, "logits/chosen": -0.77734375, "logits/rejected": -0.287109375, "logps/chosen": -179.0, "logps/rejected": -159.0, "loss": 1.385, "rewards/accuracies": 0.26249998807907104, "rewards/chosen": -0.00023365020751953125, "rewards/margins": -0.00096893310546875, "rewards/rejected": 0.000736236572265625, "step": 60 }, { "epoch": 0.004843955435609992, "grad_norm": 4.895741139526374, "learning_rate": 2.4204702627939142e-08, "logits/chosen": -0.9296875, "logits/rejected": -0.396484375, "logps/chosen": -191.0, "logps/rejected": -177.0, "loss": 1.3851, "rewards/accuracies": 0.26875001192092896, "rewards/chosen": -0.000186920166015625, "rewards/margins": 0.0002498626708984375, "rewards/rejected": -0.000438690185546875, "step": 70 }, { "epoch": 0.005535949069268562, "grad_norm": 5.02915608588916, "learning_rate": 2.7662517289073302e-08, "logits/chosen": -0.796875, "logits/rejected": -0.3984375, "logps/chosen": -169.0, "logps/rejected": -179.0, "loss": 1.3851, "rewards/accuracies": 0.29374998807907104, "rewards/chosen": -0.000736236572265625, "rewards/margins": -6.198883056640625e-05, "rewards/rejected": -0.00067138671875, "step": 80 }, { "epoch": 0.006227942702927133, "grad_norm": 5.246219720037401, "learning_rate": 3.112033195020747e-08, "logits/chosen": -0.87109375, "logits/rejected": -0.220703125, "logps/chosen": -187.0, "logps/rejected": -154.0, "loss": 1.3841, "rewards/accuracies": 0.34375, "rewards/chosen": 0.00151824951171875, "rewards/margins": 0.001068115234375, "rewards/rejected": 0.000453948974609375, "step": 90 }, { "epoch": 0.0069199363365857035, "grad_norm": 5.054441504168726, "learning_rate": 3.457814661134163e-08, "logits/chosen": -0.94140625, "logits/rejected": -0.314453125, "logps/chosen": -187.0, "logps/rejected": -174.0, "loss": 1.3838, "rewards/accuracies": 0.3375000059604645, "rewards/chosen": 0.0023040771484375, "rewards/margins": 0.0012664794921875, "rewards/rejected": 0.00102996826171875, "step": 100 }, { "epoch": 0.007611929970244274, "grad_norm": 4.939613678738369, "learning_rate": 3.803596127247579e-08, "logits/chosen": -0.92578125, "logits/rejected": -0.220703125, "logps/chosen": -200.0, "logps/rejected": -180.0, "loss": 1.3842, "rewards/accuracies": 0.41874998807907104, "rewards/chosen": 0.0030975341796875, "rewards/margins": 0.0021820068359375, "rewards/rejected": 0.00090789794921875, "step": 110 }, { "epoch": 0.008303923603902844, "grad_norm": 18.08435557838418, "learning_rate": 4.149377593360996e-08, "logits/chosen": -0.84375, "logits/rejected": -0.400390625, "logps/chosen": -181.0, "logps/rejected": -176.0, "loss": 1.3828, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.004913330078125, "rewards/margins": 0.0028076171875, "rewards/rejected": 0.002105712890625, "step": 120 }, { "epoch": 0.008995917237561414, "grad_norm": 4.880346150845241, "learning_rate": 4.495159059474412e-08, "logits/chosen": -1.0390625, "logits/rejected": -0.44921875, "logps/chosen": -183.0, "logps/rejected": -172.0, "loss": 1.3824, "rewards/accuracies": 0.46875, "rewards/chosen": 0.0062255859375, "rewards/margins": 0.0032501220703125, "rewards/rejected": 0.00299072265625, "step": 130 }, { "epoch": 0.009687910871219984, "grad_norm": 5.07244111694318, "learning_rate": 4.8409405255878284e-08, "logits/chosen": -0.90625, "logits/rejected": -0.4375, "logps/chosen": -170.0, "logps/rejected": -199.0, "loss": 1.3815, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 0.007568359375, "rewards/margins": 0.004119873046875, "rewards/rejected": 0.0034332275390625, "step": 140 }, { "epoch": 0.010379904504878554, "grad_norm": 4.8368799612976225, "learning_rate": 5.186721991701245e-08, "logits/chosen": -0.94921875, "logits/rejected": -0.2294921875, "logps/chosen": -176.0, "logps/rejected": -181.0, "loss": 1.3799, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.010009765625, "rewards/margins": 0.006103515625, "rewards/rejected": 0.003936767578125, "step": 150 }, { "epoch": 0.011071898138537125, "grad_norm": 4.424007746160528, "learning_rate": 5.5325034578146605e-08, "logits/chosen": -0.89453125, "logits/rejected": -0.419921875, "logps/chosen": -165.0, "logps/rejected": -167.0, "loss": 1.379, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.01397705078125, "rewards/margins": 0.006744384765625, "rewards/rejected": 0.007232666015625, "step": 160 }, { "epoch": 0.011763891772195697, "grad_norm": 4.549370841854936, "learning_rate": 5.878284923928077e-08, "logits/chosen": -0.93359375, "logits/rejected": -0.310546875, "logps/chosen": -168.0, "logps/rejected": -162.0, "loss": 1.377, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.0189208984375, "rewards/margins": 0.0086669921875, "rewards/rejected": 0.01025390625, "step": 170 }, { "epoch": 0.012455885405854267, "grad_norm": 4.926425716444963, "learning_rate": 6.224066390041494e-08, "logits/chosen": -0.9765625, "logits/rejected": -0.3984375, "logps/chosen": -170.0, "logps/rejected": -143.0, "loss": 1.3749, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0244140625, "rewards/margins": 0.0108642578125, "rewards/rejected": 0.0135498046875, "step": 180 }, { "epoch": 0.013147879039512837, "grad_norm": 4.889161068519296, "learning_rate": 6.56984785615491e-08, "logits/chosen": -0.9765625, "logits/rejected": -0.3984375, "logps/chosen": -169.0, "logps/rejected": -155.0, "loss": 1.3726, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.02783203125, "rewards/margins": 0.01287841796875, "rewards/rejected": 0.01495361328125, "step": 190 }, { "epoch": 0.013839872673171407, "grad_norm": 4.901447989961106, "learning_rate": 6.915629322268326e-08, "logits/chosen": -0.87109375, "logits/rejected": -0.322265625, "logps/chosen": -181.0, "logps/rejected": -179.0, "loss": 1.3692, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.031982421875, "rewards/margins": 0.01611328125, "rewards/rejected": 0.0157470703125, "step": 200 }, { "epoch": 0.014531866306829977, "grad_norm": 4.721721967299699, "learning_rate": 7.261410788381743e-08, "logits/chosen": -0.9609375, "logits/rejected": -0.474609375, "logps/chosen": -179.0, "logps/rejected": -163.0, "loss": 1.3686, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.037841796875, "rewards/margins": 0.01806640625, "rewards/rejected": 0.0196533203125, "step": 210 }, { "epoch": 0.015223859940488547, "grad_norm": 5.214424926108448, "learning_rate": 7.607192254495158e-08, "logits/chosen": -0.734375, "logits/rejected": -0.259765625, "logps/chosen": -167.0, "logps/rejected": -152.0, "loss": 1.3652, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0400390625, "rewards/margins": 0.01953125, "rewards/rejected": 0.0203857421875, "step": 220 }, { "epoch": 0.01591585357414712, "grad_norm": 4.821243422944226, "learning_rate": 7.952973720608575e-08, "logits/chosen": -0.9375, "logits/rejected": -0.287109375, "logps/chosen": -161.0, "logps/rejected": -177.0, "loss": 1.36, "rewards/accuracies": 0.75, "rewards/chosen": 0.052734375, "rewards/margins": 0.028564453125, "rewards/rejected": 0.024169921875, "step": 230 }, { "epoch": 0.016607847207805688, "grad_norm": 4.583097555258543, "learning_rate": 8.298755186721991e-08, "logits/chosen": -0.875, "logits/rejected": -0.26953125, "logps/chosen": -163.0, "logps/rejected": -172.0, "loss": 1.3568, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.06298828125, "rewards/margins": 0.033447265625, "rewards/rejected": 0.029541015625, "step": 240 }, { "epoch": 0.01729984084146426, "grad_norm": 4.781923385528858, "learning_rate": 8.644536652835408e-08, "logits/chosen": -0.828125, "logits/rejected": -0.423828125, "logps/chosen": -158.0, "logps/rejected": -149.0, "loss": 1.3493, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.072265625, "rewards/margins": 0.03662109375, "rewards/rejected": 0.03564453125, "step": 250 }, { "epoch": 0.017991834475122828, "grad_norm": 4.805901960929202, "learning_rate": 8.990318118948823e-08, "logits/chosen": -1.0234375, "logits/rejected": -0.490234375, "logps/chosen": -167.0, "logps/rejected": -144.0, "loss": 1.3451, "rewards/accuracies": 0.8125, "rewards/chosen": 0.0859375, "rewards/margins": 0.052001953125, "rewards/rejected": 0.033935546875, "step": 260 }, { "epoch": 0.0186838281087814, "grad_norm": 4.572835651988581, "learning_rate": 9.33609958506224e-08, "logits/chosen": -0.890625, "logits/rejected": -0.37109375, "logps/chosen": -175.0, "logps/rejected": -160.0, "loss": 1.3384, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.08984375, "rewards/margins": 0.052490234375, "rewards/rejected": 0.037109375, "step": 270 }, { "epoch": 0.01937582174243997, "grad_norm": 4.541936394387127, "learning_rate": 9.681881051175657e-08, "logits/chosen": -0.97265625, "logits/rejected": -0.44921875, "logps/chosen": -151.0, "logps/rejected": -170.0, "loss": 1.3368, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.09423828125, "rewards/margins": 0.05615234375, "rewards/rejected": 0.0380859375, "step": 280 }, { "epoch": 0.02006781537609854, "grad_norm": 4.382817815803479, "learning_rate": 1.0027662517289074e-07, "logits/chosen": -0.9921875, "logits/rejected": -0.54296875, "logps/chosen": -166.0, "logps/rejected": -152.0, "loss": 1.332, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.08984375, "rewards/margins": 0.044189453125, "rewards/rejected": 0.045654296875, "step": 290 }, { "epoch": 0.02075980900975711, "grad_norm": 4.543816450041409, "learning_rate": 1.037344398340249e-07, "logits/chosen": -1.0390625, "logits/rejected": -0.48046875, "logps/chosen": -147.0, "logps/rejected": -150.0, "loss": 1.323, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.0966796875, "rewards/margins": 0.06396484375, "rewards/rejected": 0.0322265625, "step": 300 }, { "epoch": 0.02145180264341568, "grad_norm": 4.964013546690354, "learning_rate": 1.0719225449515904e-07, "logits/chosen": -0.9609375, "logits/rejected": -0.67578125, "logps/chosen": -152.0, "logps/rejected": -165.0, "loss": 1.3214, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.1025390625, "rewards/margins": 0.0615234375, "rewards/rejected": 0.041259765625, "step": 310 }, { "epoch": 0.02214379627707425, "grad_norm": 5.264595987394951, "learning_rate": 1.1065006915629321e-07, "logits/chosen": -0.94921875, "logits/rejected": -0.4453125, "logps/chosen": -167.0, "logps/rejected": -170.0, "loss": 1.3109, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.123046875, "rewards/margins": 0.09375, "rewards/rejected": 0.0291748046875, "step": 320 }, { "epoch": 0.02283578991073282, "grad_norm": 4.664721607605645, "learning_rate": 1.1410788381742738e-07, "logits/chosen": -1.1484375, "logits/rejected": -0.58203125, "logps/chosen": -157.0, "logps/rejected": -162.0, "loss": 1.3024, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.109375, "rewards/margins": 0.08447265625, "rewards/rejected": 0.0244140625, "step": 330 }, { "epoch": 0.023527783544391393, "grad_norm": 4.913585706176484, "learning_rate": 1.1756569847856154e-07, "logits/chosen": -1.1953125, "logits/rejected": -0.6015625, "logps/chosen": -185.0, "logps/rejected": -186.0, "loss": 1.2787, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": 0.12890625, "rewards/margins": 0.1376953125, "rewards/rejected": -0.00799560546875, "step": 340 }, { "epoch": 0.02421977717804996, "grad_norm": 5.030983163238599, "learning_rate": 1.210235131396957e-07, "logits/chosen": -1.1328125, "logits/rejected": -0.6484375, "logps/chosen": -168.0, "logps/rejected": -156.0, "loss": 1.2751, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": 0.1142578125, "rewards/margins": 0.1123046875, "rewards/rejected": 0.001861572265625, "step": 350 }, { "epoch": 0.024911770811708533, "grad_norm": 4.990080668857401, "learning_rate": 1.2448132780082988e-07, "logits/chosen": -1.1953125, "logits/rejected": -0.7265625, "logps/chosen": -178.0, "logps/rejected": -171.0, "loss": 1.2649, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.10791015625, "rewards/margins": 0.1337890625, "rewards/rejected": -0.0260009765625, "step": 360 }, { "epoch": 0.025603764445367102, "grad_norm": 5.300235338961517, "learning_rate": 1.2793914246196402e-07, "logits/chosen": -1.2265625, "logits/rejected": -0.609375, "logps/chosen": -192.0, "logps/rejected": -186.0, "loss": 1.2476, "rewards/accuracies": 0.856249988079071, "rewards/chosen": 0.109375, "rewards/margins": 0.189453125, "rewards/rejected": -0.080078125, "step": 370 }, { "epoch": 0.026295758079025674, "grad_norm": 4.909774936540013, "learning_rate": 1.313969571230982e-07, "logits/chosen": -1.125, "logits/rejected": -0.6015625, "logps/chosen": -175.0, "logps/rejected": -178.0, "loss": 1.2457, "rewards/accuracies": 0.731249988079071, "rewards/chosen": 0.07373046875, "rewards/margins": 0.1455078125, "rewards/rejected": -0.07177734375, "step": 380 }, { "epoch": 0.026987751712684242, "grad_norm": 5.095610398967792, "learning_rate": 1.3485477178423235e-07, "logits/chosen": -1.140625, "logits/rejected": -0.78515625, "logps/chosen": -174.0, "logps/rejected": -192.0, "loss": 1.2221, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.0732421875, "rewards/margins": 0.1748046875, "rewards/rejected": -0.1015625, "step": 390 }, { "epoch": 0.027679745346342814, "grad_norm": 5.106300383410246, "learning_rate": 1.3831258644536652e-07, "logits/chosen": -1.4375, "logits/rejected": -0.796875, "logps/chosen": -192.0, "logps/rejected": -200.0, "loss": 1.2006, "rewards/accuracies": 0.768750011920929, "rewards/chosen": 0.033935546875, "rewards/margins": 0.23828125, "rewards/rejected": -0.205078125, "step": 400 }, { "epoch": 0.028371738980001383, "grad_norm": 5.5836766765532975, "learning_rate": 1.4177040110650069e-07, "logits/chosen": -1.3828125, "logits/rejected": -0.89453125, "logps/chosen": -198.0, "logps/rejected": -176.0, "loss": 1.1873, "rewards/accuracies": 0.78125, "rewards/chosen": -0.018798828125, "rewards/margins": 0.220703125, "rewards/rejected": -0.2392578125, "step": 410 }, { "epoch": 0.029063732613659955, "grad_norm": 5.95606778076522, "learning_rate": 1.4522821576763485e-07, "logits/chosen": -1.234375, "logits/rejected": -0.74609375, "logps/chosen": -186.0, "logps/rejected": -226.0, "loss": 1.1697, "rewards/accuracies": 0.78125, "rewards/chosen": -0.04736328125, "rewards/margins": 0.2421875, "rewards/rejected": -0.2890625, "step": 420 }, { "epoch": 0.029755726247318523, "grad_norm": 5.900679210659443, "learning_rate": 1.4868603042876902e-07, "logits/chosen": -1.3359375, "logits/rejected": -0.8984375, "logps/chosen": -190.0, "logps/rejected": -232.0, "loss": 1.1405, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06494140625, "rewards/margins": 0.353515625, "rewards/rejected": -0.41796875, "step": 430 }, { "epoch": 0.030447719880977095, "grad_norm": 6.257392033835001, "learning_rate": 1.5214384508990316e-07, "logits/chosen": -1.4140625, "logits/rejected": -0.8046875, "logps/chosen": -181.0, "logps/rejected": -196.0, "loss": 1.1219, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.09228515625, "rewards/margins": 0.357421875, "rewards/rejected": -0.44921875, "step": 440 }, { "epoch": 0.031139713514635667, "grad_norm": 6.75524768067718, "learning_rate": 1.5560165975103733e-07, "logits/chosen": -1.453125, "logits/rejected": -1.0234375, "logps/chosen": -188.0, "logps/rejected": -215.0, "loss": 1.119, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.1416015625, "rewards/margins": 0.349609375, "rewards/rejected": -0.4921875, "step": 450 }, { "epoch": 0.03183170714829424, "grad_norm": 5.973106346355359, "learning_rate": 1.590594744121715e-07, "logits/chosen": -1.5546875, "logits/rejected": -1.0859375, "logps/chosen": -206.0, "logps/rejected": -220.0, "loss": 1.1004, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.1826171875, "rewards/margins": 0.419921875, "rewards/rejected": -0.6015625, "step": 460 }, { "epoch": 0.032523700781952804, "grad_norm": 8.794760744665457, "learning_rate": 1.6251728907330566e-07, "logits/chosen": -1.515625, "logits/rejected": -0.9296875, "logps/chosen": -206.0, "logps/rejected": -231.0, "loss": 1.0793, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.244140625, "rewards/margins": 0.330078125, "rewards/rejected": -0.57421875, "step": 470 }, { "epoch": 0.033215694415611376, "grad_norm": 6.158767422144694, "learning_rate": 1.6597510373443983e-07, "logits/chosen": -1.4765625, "logits/rejected": -1.03125, "logps/chosen": -193.0, "logps/rejected": -231.0, "loss": 1.0631, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.267578125, "rewards/margins": 0.4140625, "rewards/rejected": -0.6796875, "step": 480 }, { "epoch": 0.03390768804926995, "grad_norm": 7.293529933058435, "learning_rate": 1.69432918395574e-07, "logits/chosen": -1.6328125, "logits/rejected": -1.203125, "logps/chosen": -207.0, "logps/rejected": -244.0, "loss": 1.0301, "rewards/accuracies": 0.78125, "rewards/chosen": -0.298828125, "rewards/margins": 0.43359375, "rewards/rejected": -0.734375, "step": 490 }, { "epoch": 0.03459968168292852, "grad_norm": 7.42478597112603, "learning_rate": 1.7289073305670816e-07, "logits/chosen": -1.53125, "logits/rejected": -1.0078125, "logps/chosen": -213.0, "logps/rejected": -264.0, "loss": 1.0051, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.310546875, "rewards/margins": 0.60546875, "rewards/rejected": -0.91796875, "step": 500 }, { "epoch": 0.035291675316587084, "grad_norm": 7.6628540820494635, "learning_rate": 1.7634854771784233e-07, "logits/chosen": -1.6328125, "logits/rejected": -1.078125, "logps/chosen": -223.0, "logps/rejected": -244.0, "loss": 1.0106, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.318359375, "rewards/margins": 0.5, "rewards/rejected": -0.8203125, "step": 510 }, { "epoch": 0.035983668950245656, "grad_norm": 8.137197812621295, "learning_rate": 1.7980636237897647e-07, "logits/chosen": -1.5390625, "logits/rejected": -1.0546875, "logps/chosen": -206.0, "logps/rejected": -256.0, "loss": 0.9818, "rewards/accuracies": 0.8125, "rewards/chosen": -0.38671875, "rewards/margins": 0.5703125, "rewards/rejected": -0.95703125, "step": 520 }, { "epoch": 0.03667566258390423, "grad_norm": 10.57221573628253, "learning_rate": 1.8326417704011064e-07, "logits/chosen": -1.640625, "logits/rejected": -1.2578125, "logps/chosen": -239.0, "logps/rejected": -268.0, "loss": 1.026, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.482421875, "rewards/margins": 0.5, "rewards/rejected": -0.984375, "step": 530 }, { "epoch": 0.0373676562175628, "grad_norm": 8.93792517377371, "learning_rate": 1.867219917012448e-07, "logits/chosen": -1.6640625, "logits/rejected": -1.078125, "logps/chosen": -218.0, "logps/rejected": -255.0, "loss": 0.9778, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.451171875, "rewards/margins": 0.56640625, "rewards/rejected": -1.015625, "step": 540 }, { "epoch": 0.03805964985122137, "grad_norm": 15.112623680672257, "learning_rate": 1.9017980636237897e-07, "logits/chosen": -1.6328125, "logits/rejected": -1.125, "logps/chosen": -233.0, "logps/rejected": -294.0, "loss": 0.9697, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -0.421875, "rewards/margins": 0.7578125, "rewards/rejected": -1.1796875, "step": 550 }, { "epoch": 0.03875164348487994, "grad_norm": 14.744569631015363, "learning_rate": 1.9363762102351314e-07, "logits/chosen": -1.609375, "logits/rejected": -1.21875, "logps/chosen": -221.0, "logps/rejected": -274.0, "loss": 0.9239, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.45703125, "rewards/margins": 0.69921875, "rewards/rejected": -1.15625, "step": 560 }, { "epoch": 0.03944363711853851, "grad_norm": 10.262911309206409, "learning_rate": 1.9709543568464728e-07, "logits/chosen": -1.6640625, "logits/rejected": -1.296875, "logps/chosen": -224.0, "logps/rejected": -251.0, "loss": 0.957, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.4921875, "rewards/margins": 0.6328125, "rewards/rejected": -1.125, "step": 570 }, { "epoch": 0.04013563075219708, "grad_norm": 9.450777255551845, "learning_rate": 2.0055325034578147e-07, "logits/chosen": -1.75, "logits/rejected": -1.328125, "logps/chosen": -231.0, "logps/rejected": -260.0, "loss": 0.963, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -0.421875, "rewards/margins": 0.71484375, "rewards/rejected": -1.140625, "step": 580 }, { "epoch": 0.04082762438585565, "grad_norm": 12.710990218035963, "learning_rate": 2.040110650069156e-07, "logits/chosen": -1.671875, "logits/rejected": -1.28125, "logps/chosen": -234.0, "logps/rejected": -280.0, "loss": 0.9659, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.62890625, "rewards/margins": 0.6171875, "rewards/rejected": -1.25, "step": 590 }, { "epoch": 0.04151961801951422, "grad_norm": 12.127488459261162, "learning_rate": 2.074688796680498e-07, "logits/chosen": -1.7109375, "logits/rejected": -1.2890625, "logps/chosen": -241.0, "logps/rejected": -302.0, "loss": 0.959, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.68359375, "rewards/margins": 0.66015625, "rewards/rejected": -1.34375, "step": 600 }, { "epoch": 0.04221161165317279, "grad_norm": 12.466096897514635, "learning_rate": 2.1092669432918394e-07, "logits/chosen": -1.6953125, "logits/rejected": -1.2421875, "logps/chosen": -239.0, "logps/rejected": -304.0, "loss": 0.916, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.52734375, "rewards/margins": 0.75, "rewards/rejected": -1.2734375, "step": 610 }, { "epoch": 0.04290360528683136, "grad_norm": 9.104928427821097, "learning_rate": 2.1438450899031809e-07, "logits/chosen": -1.75, "logits/rejected": -1.2578125, "logps/chosen": -247.0, "logps/rejected": -324.0, "loss": 0.8936, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -0.54296875, "rewards/margins": 0.83984375, "rewards/rejected": -1.3828125, "step": 620 }, { "epoch": 0.043595598920489934, "grad_norm": 19.682211294670893, "learning_rate": 2.1784232365145228e-07, "logits/chosen": -1.84375, "logits/rejected": -1.3984375, "logps/chosen": -266.0, "logps/rejected": -314.0, "loss": 0.9302, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.68359375, "rewards/margins": 0.78515625, "rewards/rejected": -1.46875, "step": 630 }, { "epoch": 0.0442875925541485, "grad_norm": 13.008305480140363, "learning_rate": 2.2130013831258642e-07, "logits/chosen": -1.703125, "logits/rejected": -1.375, "logps/chosen": -238.0, "logps/rejected": -308.0, "loss": 0.9099, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -0.58984375, "rewards/margins": 0.84375, "rewards/rejected": -1.4375, "step": 640 }, { "epoch": 0.04497958618780707, "grad_norm": 17.329105831664936, "learning_rate": 2.247579529737206e-07, "logits/chosen": -1.703125, "logits/rejected": -1.2890625, "logps/chosen": -234.0, "logps/rejected": -328.0, "loss": 0.9255, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -0.58984375, "rewards/margins": 0.94921875, "rewards/rejected": -1.5390625, "step": 650 }, { "epoch": 0.04567157982146564, "grad_norm": 17.310454240717803, "learning_rate": 2.2821576763485475e-07, "logits/chosen": -1.8359375, "logits/rejected": -1.546875, "logps/chosen": -264.0, "logps/rejected": -330.0, "loss": 0.8887, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.8359375, "rewards/margins": 0.6328125, "rewards/rejected": -1.46875, "step": 660 }, { "epoch": 0.046363573455124214, "grad_norm": 13.807267353772332, "learning_rate": 2.3167358229598895e-07, "logits/chosen": -1.890625, "logits/rejected": -1.421875, "logps/chosen": -262.0, "logps/rejected": -348.0, "loss": 0.8796, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -0.70703125, "rewards/margins": 1.109375, "rewards/rejected": -1.8203125, "step": 670 }, { "epoch": 0.047055567088782786, "grad_norm": 31.42001992334512, "learning_rate": 2.3513139695712309e-07, "logits/chosen": -1.6875, "logits/rejected": -1.28125, "logps/chosen": -223.0, "logps/rejected": -314.0, "loss": 0.8714, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.67578125, "rewards/margins": 0.9296875, "rewards/rejected": -1.6015625, "step": 680 }, { "epoch": 0.04774756072244135, "grad_norm": 15.991517049527705, "learning_rate": 2.3858921161825725e-07, "logits/chosen": -1.75, "logits/rejected": -1.3359375, "logps/chosen": -255.0, "logps/rejected": -316.0, "loss": 0.8707, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.87109375, "rewards/margins": 0.77734375, "rewards/rejected": -1.6484375, "step": 690 }, { "epoch": 0.04843955435609992, "grad_norm": 14.17814362819406, "learning_rate": 2.420470262793914e-07, "logits/chosen": -1.828125, "logits/rejected": -1.4140625, "logps/chosen": -266.0, "logps/rejected": -362.0, "loss": 0.8369, "rewards/accuracies": 0.84375, "rewards/chosen": -0.7421875, "rewards/margins": 1.0625, "rewards/rejected": -1.8046875, "step": 700 }, { "epoch": 0.049131547989758495, "grad_norm": 15.298080479865, "learning_rate": 2.455048409405256e-07, "logits/chosen": -1.8203125, "logits/rejected": -1.328125, "logps/chosen": -274.0, "logps/rejected": -374.0, "loss": 0.8801, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.9453125, "rewards/margins": 0.91796875, "rewards/rejected": -1.859375, "step": 710 }, { "epoch": 0.04982354162341707, "grad_norm": 16.200765427694716, "learning_rate": 2.4896265560165975e-07, "logits/chosen": -1.7578125, "logits/rejected": -1.3515625, "logps/chosen": -237.0, "logps/rejected": -378.0, "loss": 0.809, "rewards/accuracies": 0.90625, "rewards/chosen": -0.6796875, "rewards/margins": 1.265625, "rewards/rejected": -1.9453125, "step": 720 }, { "epoch": 0.05051553525707563, "grad_norm": 20.359162676268923, "learning_rate": 2.5242047026279387e-07, "logits/chosen": -1.8125, "logits/rejected": -1.4453125, "logps/chosen": -282.0, "logps/rejected": -376.0, "loss": 0.871, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.94921875, "rewards/margins": 1.03125, "rewards/rejected": -1.984375, "step": 730 }, { "epoch": 0.051207528890734204, "grad_norm": 16.70772785636123, "learning_rate": 2.5587828492392804e-07, "logits/chosen": -1.7890625, "logits/rejected": -1.3671875, "logps/chosen": -260.0, "logps/rejected": -340.0, "loss": 0.833, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.99609375, "rewards/margins": 0.859375, "rewards/rejected": -1.8515625, "step": 740 }, { "epoch": 0.051899522524392776, "grad_norm": 15.53601815759455, "learning_rate": 2.5933609958506226e-07, "logits/chosen": -1.9296875, "logits/rejected": -1.3671875, "logps/chosen": -300.0, "logps/rejected": -364.0, "loss": 0.8595, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.1015625, "rewards/margins": 0.98046875, "rewards/rejected": -2.078125, "step": 750 }, { "epoch": 0.05259151615805135, "grad_norm": 16.175139586169205, "learning_rate": 2.627939142461964e-07, "logits/chosen": -1.78125, "logits/rejected": -1.53125, "logps/chosen": -248.0, "logps/rejected": -348.0, "loss": 0.8509, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.83203125, "rewards/margins": 0.96484375, "rewards/rejected": -1.796875, "step": 760 }, { "epoch": 0.05328350979170992, "grad_norm": 17.296078581950976, "learning_rate": 2.6625172890733054e-07, "logits/chosen": -1.8359375, "logits/rejected": -1.40625, "logps/chosen": -258.0, "logps/rejected": -362.0, "loss": 0.7847, "rewards/accuracies": 0.875, "rewards/chosen": -0.859375, "rewards/margins": 1.1875, "rewards/rejected": -2.046875, "step": 770 }, { "epoch": 0.053975503425368485, "grad_norm": 15.423337337677815, "learning_rate": 2.697095435684647e-07, "logits/chosen": -1.84375, "logits/rejected": -1.5234375, "logps/chosen": -264.0, "logps/rejected": -372.0, "loss": 0.8352, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -0.953125, "rewards/margins": 1.0625, "rewards/rejected": -2.015625, "step": 780 }, { "epoch": 0.054667497059027056, "grad_norm": 17.509849225119204, "learning_rate": 2.7316735822959887e-07, "logits/chosen": -1.90625, "logits/rejected": -1.453125, "logps/chosen": -304.0, "logps/rejected": -388.0, "loss": 0.8387, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.1171875, "rewards/margins": 1.140625, "rewards/rejected": -2.265625, "step": 790 }, { "epoch": 0.05535949069268563, "grad_norm": 17.926216358432697, "learning_rate": 2.7662517289073304e-07, "logits/chosen": -1.921875, "logits/rejected": -1.5234375, "logps/chosen": -270.0, "logps/rejected": -384.0, "loss": 0.7739, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.0546875, "rewards/margins": 1.1640625, "rewards/rejected": -2.21875, "step": 800 }, { "epoch": 0.0560514843263442, "grad_norm": 20.37464843868995, "learning_rate": 2.800829875518672e-07, "logits/chosen": -1.9140625, "logits/rejected": -1.6015625, "logps/chosen": -278.0, "logps/rejected": -382.0, "loss": 0.8224, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.1328125, "rewards/margins": 1.1015625, "rewards/rejected": -2.234375, "step": 810 }, { "epoch": 0.056743477960002765, "grad_norm": 24.269351690658702, "learning_rate": 2.8354080221300137e-07, "logits/chosen": -1.796875, "logits/rejected": -1.515625, "logps/chosen": -276.0, "logps/rejected": -362.0, "loss": 0.796, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0, "rewards/margins": 1.046875, "rewards/rejected": -2.046875, "step": 820 }, { "epoch": 0.05743547159366134, "grad_norm": 26.86215219336741, "learning_rate": 2.8699861687413554e-07, "logits/chosen": -1.875, "logits/rejected": -1.5703125, "logps/chosen": -286.0, "logps/rejected": -386.0, "loss": 0.7697, "rewards/accuracies": 0.8125, "rewards/chosen": -1.046875, "rewards/margins": 1.1875, "rewards/rejected": -2.234375, "step": 830 }, { "epoch": 0.05812746522731991, "grad_norm": 19.778118006596745, "learning_rate": 2.904564315352697e-07, "logits/chosen": -1.9453125, "logits/rejected": -1.4765625, "logps/chosen": -292.0, "logps/rejected": -410.0, "loss": 0.7561, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.1328125, "rewards/margins": 1.3125, "rewards/rejected": -2.453125, "step": 840 }, { "epoch": 0.05881945886097848, "grad_norm": 23.56070937580022, "learning_rate": 2.9391424619640387e-07, "logits/chosen": -2.015625, "logits/rejected": -1.6015625, "logps/chosen": -294.0, "logps/rejected": -386.0, "loss": 0.7934, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.1328125, "rewards/margins": 1.140625, "rewards/rejected": -2.265625, "step": 850 }, { "epoch": 0.059511452494637046, "grad_norm": 14.649912828061758, "learning_rate": 2.9737206085753804e-07, "logits/chosen": -1.8515625, "logits/rejected": -1.5390625, "logps/chosen": -286.0, "logps/rejected": -374.0, "loss": 0.8109, "rewards/accuracies": 0.8125, "rewards/chosen": -1.140625, "rewards/margins": 1.0234375, "rewards/rejected": -2.171875, "step": 860 }, { "epoch": 0.06020344612829562, "grad_norm": 17.622496120390437, "learning_rate": 3.0082987551867215e-07, "logits/chosen": -1.953125, "logits/rejected": -1.6640625, "logps/chosen": -320.0, "logps/rejected": -408.0, "loss": 0.7932, "rewards/accuracies": 0.84375, "rewards/chosen": -1.21875, "rewards/margins": 1.1640625, "rewards/rejected": -2.390625, "step": 870 }, { "epoch": 0.06089543976195419, "grad_norm": 18.149777704289054, "learning_rate": 3.042876901798063e-07, "logits/chosen": -2.03125, "logits/rejected": -1.6484375, "logps/chosen": -280.0, "logps/rejected": -408.0, "loss": 0.7457, "rewards/accuracies": 0.84375, "rewards/chosen": -1.140625, "rewards/margins": 1.265625, "rewards/rejected": -2.40625, "step": 880 }, { "epoch": 0.06158743339561276, "grad_norm": 18.7413496539697, "learning_rate": 3.0774550484094054e-07, "logits/chosen": -1.9609375, "logits/rejected": -1.5859375, "logps/chosen": -308.0, "logps/rejected": -400.0, "loss": 0.7318, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.21875, "rewards/margins": 1.28125, "rewards/rejected": -2.5, "step": 890 }, { "epoch": 0.062279427029271334, "grad_norm": 21.765051500878666, "learning_rate": 3.1120331950207465e-07, "logits/chosen": -2.015625, "logits/rejected": -1.671875, "logps/chosen": -312.0, "logps/rejected": -424.0, "loss": 0.7134, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.2890625, "rewards/margins": 1.3359375, "rewards/rejected": -2.625, "step": 900 }, { "epoch": 0.0629714206629299, "grad_norm": 25.259699103189305, "learning_rate": 3.146611341632088e-07, "logits/chosen": -2.0625, "logits/rejected": -1.7109375, "logps/chosen": -300.0, "logps/rejected": -430.0, "loss": 0.7459, "rewards/accuracies": 0.875, "rewards/chosen": -1.21875, "rewards/margins": 1.4296875, "rewards/rejected": -2.640625, "step": 910 }, { "epoch": 0.06366341429658848, "grad_norm": 21.075177435330758, "learning_rate": 3.18118948824343e-07, "logits/chosen": -2.078125, "logits/rejected": -1.6875, "logps/chosen": -284.0, "logps/rejected": -410.0, "loss": 0.7177, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.1015625, "rewards/margins": 1.375, "rewards/rejected": -2.484375, "step": 920 }, { "epoch": 0.06435540793024704, "grad_norm": 20.41098368521284, "learning_rate": 3.215767634854772e-07, "logits/chosen": -2.0625, "logits/rejected": -1.6640625, "logps/chosen": -302.0, "logps/rejected": -432.0, "loss": 0.7313, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.2578125, "rewards/margins": 1.4921875, "rewards/rejected": -2.75, "step": 930 }, { "epoch": 0.06504740156390561, "grad_norm": 14.801858118994366, "learning_rate": 3.250345781466113e-07, "logits/chosen": -2.03125, "logits/rejected": -1.671875, "logps/chosen": -304.0, "logps/rejected": -450.0, "loss": 0.6781, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.2265625, "rewards/margins": 1.484375, "rewards/rejected": -2.703125, "step": 940 }, { "epoch": 0.06573939519756418, "grad_norm": 19.965385154800575, "learning_rate": 3.284923928077455e-07, "logits/chosen": -2.0625, "logits/rejected": -1.6328125, "logps/chosen": -298.0, "logps/rejected": -454.0, "loss": 0.7621, "rewards/accuracies": 0.875, "rewards/chosen": -1.2578125, "rewards/margins": 1.609375, "rewards/rejected": -2.859375, "step": 950 }, { "epoch": 0.06643138883122275, "grad_norm": 16.697933503190274, "learning_rate": 3.3195020746887966e-07, "logits/chosen": -2.140625, "logits/rejected": -1.71875, "logps/chosen": -308.0, "logps/rejected": -410.0, "loss": 0.7356, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -1.4140625, "rewards/margins": 1.28125, "rewards/rejected": -2.703125, "step": 960 }, { "epoch": 0.06712338246488132, "grad_norm": 17.94255265681201, "learning_rate": 3.3540802213001377e-07, "logits/chosen": -2.0625, "logits/rejected": -1.71875, "logps/chosen": -318.0, "logps/rejected": -440.0, "loss": 0.7099, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.40625, "rewards/margins": 1.4140625, "rewards/rejected": -2.8125, "step": 970 }, { "epoch": 0.0678153760985399, "grad_norm": 15.655781893024331, "learning_rate": 3.38865836791148e-07, "logits/chosen": -2.140625, "logits/rejected": -1.6484375, "logps/chosen": -318.0, "logps/rejected": -436.0, "loss": 0.6919, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.3359375, "rewards/margins": 1.6015625, "rewards/rejected": -2.9375, "step": 980 }, { "epoch": 0.06850736973219847, "grad_norm": 20.395981005808103, "learning_rate": 3.4232365145228216e-07, "logits/chosen": -2.125, "logits/rejected": -1.6171875, "logps/chosen": -288.0, "logps/rejected": -434.0, "loss": 0.6806, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.2109375, "rewards/margins": 1.6015625, "rewards/rejected": -2.8125, "step": 990 }, { "epoch": 0.06919936336585704, "grad_norm": 22.35885284777077, "learning_rate": 3.457814661134163e-07, "logits/chosen": -2.203125, "logits/rejected": -1.7109375, "logps/chosen": -320.0, "logps/rejected": -464.0, "loss": 0.7397, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -1.46875, "rewards/margins": 1.5078125, "rewards/rejected": -2.96875, "step": 1000 }, { "epoch": 0.06919936336585704, "eval_logits/chosen": -2.109375, "eval_logits/rejected": -1.75, "eval_logps/chosen": -350.0, "eval_logps/rejected": -472.0, "eval_loss": 0.36021560430526733, "eval_rewards/accuracies": 0.8426259160041809, "eval_rewards/chosen": -1.625, "eval_rewards/margins": 1.4609375, "eval_rewards/rejected": -3.078125, "eval_runtime": 2935.6476, "eval_samples_per_second": 33.326, "eval_steps_per_second": 0.521, "step": 1000 }, { "epoch": 0.06989135699951561, "grad_norm": 18.532310516149558, "learning_rate": 3.4923928077455044e-07, "logits/chosen": -2.140625, "logits/rejected": -1.6875, "logps/chosen": -310.0, "logps/rejected": -448.0, "loss": 0.6658, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.46875, "rewards/margins": 1.515625, "rewards/rejected": -2.984375, "step": 1010 }, { "epoch": 0.07058335063317417, "grad_norm": 19.234065283718696, "learning_rate": 3.5269709543568466e-07, "logits/chosen": -2.046875, "logits/rejected": -1.7578125, "logps/chosen": -306.0, "logps/rejected": -454.0, "loss": 0.7041, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -1.3125, "rewards/margins": 1.6015625, "rewards/rejected": -2.90625, "step": 1020 }, { "epoch": 0.07127534426683274, "grad_norm": 19.397948245714275, "learning_rate": 3.561549100968188e-07, "logits/chosen": -2.09375, "logits/rejected": -1.796875, "logps/chosen": -296.0, "logps/rejected": -438.0, "loss": 0.6423, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.25, "rewards/margins": 1.6640625, "rewards/rejected": -2.921875, "step": 1030 }, { "epoch": 0.07196733790049131, "grad_norm": 17.992889045045672, "learning_rate": 3.5961272475795294e-07, "logits/chosen": -2.21875, "logits/rejected": -1.796875, "logps/chosen": -304.0, "logps/rejected": -472.0, "loss": 0.6452, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.3828125, "rewards/margins": 1.6953125, "rewards/rejected": -3.078125, "step": 1040 }, { "epoch": 0.07265933153414988, "grad_norm": 20.602065578070444, "learning_rate": 3.630705394190871e-07, "logits/chosen": -2.203125, "logits/rejected": -2.03125, "logps/chosen": -332.0, "logps/rejected": -454.0, "loss": 0.7198, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5078125, "rewards/margins": 1.34375, "rewards/rejected": -2.859375, "step": 1050 }, { "epoch": 0.07335132516780846, "grad_norm": 21.10175066207592, "learning_rate": 3.6652835408022127e-07, "logits/chosen": -2.234375, "logits/rejected": -1.9140625, "logps/chosen": -306.0, "logps/rejected": -478.0, "loss": 0.6814, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.375, "rewards/margins": 1.6953125, "rewards/rejected": -3.078125, "step": 1060 }, { "epoch": 0.07404331880146703, "grad_norm": 19.471538022521713, "learning_rate": 3.6998616874135544e-07, "logits/chosen": -2.203125, "logits/rejected": -1.8828125, "logps/chosen": -314.0, "logps/rejected": -450.0, "loss": 0.6803, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.453125, "rewards/margins": 1.59375, "rewards/rejected": -3.046875, "step": 1070 }, { "epoch": 0.0747353124351256, "grad_norm": 20.09657044960564, "learning_rate": 3.734439834024896e-07, "logits/chosen": -2.234375, "logits/rejected": -1.9296875, "logps/chosen": -310.0, "logps/rejected": -476.0, "loss": 0.6667, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.53125, "rewards/margins": 1.5703125, "rewards/rejected": -3.09375, "step": 1080 }, { "epoch": 0.07542730606878417, "grad_norm": 32.78928615906616, "learning_rate": 3.7690179806362377e-07, "logits/chosen": -2.171875, "logits/rejected": -1.6640625, "logps/chosen": -322.0, "logps/rejected": -494.0, "loss": 0.6387, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.515625, "rewards/margins": 1.6484375, "rewards/rejected": -3.15625, "step": 1090 }, { "epoch": 0.07611929970244274, "grad_norm": 20.631880871838828, "learning_rate": 3.8035961272475794e-07, "logits/chosen": -2.125, "logits/rejected": -1.8203125, "logps/chosen": -342.0, "logps/rejected": -482.0, "loss": 0.7055, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -1.6640625, "rewards/margins": 1.515625, "rewards/rejected": -3.171875, "step": 1100 }, { "epoch": 0.0768112933361013, "grad_norm": 28.11873473535321, "learning_rate": 3.838174273858921e-07, "logits/chosen": -2.125, "logits/rejected": -1.7109375, "logps/chosen": -316.0, "logps/rejected": -478.0, "loss": 0.6405, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.515625, "rewards/margins": 1.7109375, "rewards/rejected": -3.21875, "step": 1110 }, { "epoch": 0.07750328696975987, "grad_norm": 21.288242461531695, "learning_rate": 3.8727524204702627e-07, "logits/chosen": -2.21875, "logits/rejected": -1.9765625, "logps/chosen": -336.0, "logps/rejected": -500.0, "loss": 0.6751, "rewards/accuracies": 0.875, "rewards/chosen": -1.703125, "rewards/margins": 1.6875, "rewards/rejected": -3.390625, "step": 1120 }, { "epoch": 0.07819528060341845, "grad_norm": 33.124117543266884, "learning_rate": 3.9073305670816044e-07, "logits/chosen": -2.3125, "logits/rejected": -1.9453125, "logps/chosen": -348.0, "logps/rejected": -508.0, "loss": 0.6507, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.6875, "rewards/margins": 1.8203125, "rewards/rejected": -3.5, "step": 1130 }, { "epoch": 0.07888727423707702, "grad_norm": 19.522932747128824, "learning_rate": 3.9419087136929455e-07, "logits/chosen": -2.203125, "logits/rejected": -1.796875, "logps/chosen": -330.0, "logps/rejected": -520.0, "loss": 0.6474, "rewards/accuracies": 0.90625, "rewards/chosen": -1.59375, "rewards/margins": 1.9375, "rewards/rejected": -3.53125, "step": 1140 }, { "epoch": 0.07957926787073559, "grad_norm": 25.03072461162178, "learning_rate": 3.976486860304287e-07, "logits/chosen": -2.1875, "logits/rejected": -1.7265625, "logps/chosen": -360.0, "logps/rejected": -512.0, "loss": 0.6323, "rewards/accuracies": 0.875, "rewards/chosen": -1.6796875, "rewards/margins": 1.8125, "rewards/rejected": -3.5, "step": 1150 }, { "epoch": 0.08027126150439416, "grad_norm": 27.36170102104842, "learning_rate": 4.0110650069156294e-07, "logits/chosen": -2.25, "logits/rejected": -1.953125, "logps/chosen": -332.0, "logps/rejected": -502.0, "loss": 0.6407, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.7265625, "rewards/margins": 1.8359375, "rewards/rejected": -3.5625, "step": 1160 }, { "epoch": 0.08096325513805273, "grad_norm": 26.26176155002112, "learning_rate": 4.045643153526971e-07, "logits/chosen": -2.109375, "logits/rejected": -1.984375, "logps/chosen": -370.0, "logps/rejected": -500.0, "loss": 0.6704, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.9140625, "rewards/margins": 1.546875, "rewards/rejected": -3.46875, "step": 1170 }, { "epoch": 0.0816552487717113, "grad_norm": 18.435552971244647, "learning_rate": 4.080221300138312e-07, "logits/chosen": -2.171875, "logits/rejected": -1.9453125, "logps/chosen": -334.0, "logps/rejected": -490.0, "loss": 0.6548, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.7109375, "rewards/margins": 1.703125, "rewards/rejected": -3.40625, "step": 1180 }, { "epoch": 0.08234724240536986, "grad_norm": 22.26688577037996, "learning_rate": 4.114799446749654e-07, "logits/chosen": -2.203125, "logits/rejected": -1.9375, "logps/chosen": -350.0, "logps/rejected": -520.0, "loss": 0.6625, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.7578125, "rewards/margins": 1.8359375, "rewards/rejected": -3.59375, "step": 1190 }, { "epoch": 0.08303923603902844, "grad_norm": 19.966455789510672, "learning_rate": 4.149377593360996e-07, "logits/chosen": -2.140625, "logits/rejected": -1.890625, "logps/chosen": -318.0, "logps/rejected": -482.0, "loss": 0.6311, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.6015625, "rewards/margins": 1.6484375, "rewards/rejected": -3.25, "step": 1200 }, { "epoch": 0.08373122967268701, "grad_norm": 22.18668360504688, "learning_rate": 4.183955739972337e-07, "logits/chosen": -2.375, "logits/rejected": -1.9609375, "logps/chosen": -350.0, "logps/rejected": -532.0, "loss": 0.6155, "rewards/accuracies": 0.875, "rewards/chosen": -1.7109375, "rewards/margins": 2.03125, "rewards/rejected": -3.75, "step": 1210 }, { "epoch": 0.08442322330634558, "grad_norm": 23.62769102433528, "learning_rate": 4.218533886583679e-07, "logits/chosen": -2.21875, "logits/rejected": -1.9765625, "logps/chosen": -334.0, "logps/rejected": -524.0, "loss": 0.6264, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -1.6015625, "rewards/margins": 2.046875, "rewards/rejected": -3.65625, "step": 1220 }, { "epoch": 0.08511521694000415, "grad_norm": 19.752521596412958, "learning_rate": 4.2531120331950206e-07, "logits/chosen": -2.21875, "logits/rejected": -1.9921875, "logps/chosen": -342.0, "logps/rejected": -498.0, "loss": 0.589, "rewards/accuracies": 0.875, "rewards/chosen": -1.7265625, "rewards/margins": 1.75, "rewards/rejected": -3.46875, "step": 1230 }, { "epoch": 0.08580721057366272, "grad_norm": 20.137661764959226, "learning_rate": 4.2876901798063617e-07, "logits/chosen": -2.296875, "logits/rejected": -2.0625, "logps/chosen": -344.0, "logps/rejected": -512.0, "loss": 0.6442, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.6875, "rewards/margins": 1.8515625, "rewards/rejected": -3.546875, "step": 1240 }, { "epoch": 0.0864992042073213, "grad_norm": 26.007362869069976, "learning_rate": 4.322268326417704e-07, "logits/chosen": -2.34375, "logits/rejected": -2.125, "logps/chosen": -334.0, "logps/rejected": -500.0, "loss": 0.5911, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -1.6875, "rewards/margins": 1.890625, "rewards/rejected": -3.578125, "step": 1250 }, { "epoch": 0.08719119784097987, "grad_norm": 20.190465462874602, "learning_rate": 4.3568464730290456e-07, "logits/chosen": -2.390625, "logits/rejected": -2.109375, "logps/chosen": -372.0, "logps/rejected": -540.0, "loss": 0.5731, "rewards/accuracies": 0.84375, "rewards/chosen": -1.9453125, "rewards/margins": 1.953125, "rewards/rejected": -3.890625, "step": 1260 }, { "epoch": 0.08788319147463844, "grad_norm": 20.54727528249839, "learning_rate": 4.391424619640387e-07, "logits/chosen": -2.265625, "logits/rejected": -1.953125, "logps/chosen": -362.0, "logps/rejected": -540.0, "loss": 0.6402, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.84375, "rewards/margins": 1.7734375, "rewards/rejected": -3.625, "step": 1270 }, { "epoch": 0.088575185108297, "grad_norm": 19.866433464820364, "learning_rate": 4.4260027662517284e-07, "logits/chosen": -2.375, "logits/rejected": -2.109375, "logps/chosen": -390.0, "logps/rejected": -536.0, "loss": 0.6419, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.109375, "rewards/margins": 1.640625, "rewards/rejected": -3.75, "step": 1280 }, { "epoch": 0.08926717874195557, "grad_norm": 17.357784504616497, "learning_rate": 4.4605809128630706e-07, "logits/chosen": -2.296875, "logits/rejected": -2.078125, "logps/chosen": -368.0, "logps/rejected": -576.0, "loss": 0.6206, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -1.8515625, "rewards/margins": 2.0625, "rewards/rejected": -3.90625, "step": 1290 }, { "epoch": 0.08995917237561414, "grad_norm": 25.670671019689237, "learning_rate": 4.495159059474412e-07, "logits/chosen": -2.28125, "logits/rejected": -2.015625, "logps/chosen": -364.0, "logps/rejected": -544.0, "loss": 0.61, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -1.875, "rewards/margins": 2.03125, "rewards/rejected": -3.90625, "step": 1300 }, { "epoch": 0.09065116600927271, "grad_norm": 22.832452331861724, "learning_rate": 4.5297372060857534e-07, "logits/chosen": -2.40625, "logits/rejected": -2.234375, "logps/chosen": -348.0, "logps/rejected": -548.0, "loss": 0.643, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.8125, "rewards/margins": 2.015625, "rewards/rejected": -3.828125, "step": 1310 }, { "epoch": 0.09134315964293128, "grad_norm": 16.502502016449867, "learning_rate": 4.564315352697095e-07, "logits/chosen": -2.28125, "logits/rejected": -2.078125, "logps/chosen": -366.0, "logps/rejected": -560.0, "loss": 0.6062, "rewards/accuracies": 0.875, "rewards/chosen": -1.8203125, "rewards/margins": 2.140625, "rewards/rejected": -3.96875, "step": 1320 }, { "epoch": 0.09203515327658986, "grad_norm": 21.106501122704845, "learning_rate": 4.5988934993084367e-07, "logits/chosen": -2.265625, "logits/rejected": -2.015625, "logps/chosen": -346.0, "logps/rejected": -540.0, "loss": 0.559, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -1.7265625, "rewards/margins": 2.125, "rewards/rejected": -3.859375, "step": 1330 }, { "epoch": 0.09272714691024843, "grad_norm": 18.78768099126919, "learning_rate": 4.633471645919779e-07, "logits/chosen": -2.453125, "logits/rejected": -2.1875, "logps/chosen": -358.0, "logps/rejected": -552.0, "loss": 0.5974, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -1.8984375, "rewards/margins": 2.171875, "rewards/rejected": -4.09375, "step": 1340 }, { "epoch": 0.093419140543907, "grad_norm": 23.09024996653307, "learning_rate": 4.66804979253112e-07, "logits/chosen": -2.4375, "logits/rejected": -2.234375, "logps/chosen": -378.0, "logps/rejected": -576.0, "loss": 0.5689, "rewards/accuracies": 0.90625, "rewards/chosen": -2.015625, "rewards/margins": 1.9921875, "rewards/rejected": -4.0, "step": 1350 }, { "epoch": 0.09411113417756557, "grad_norm": 23.657393179701153, "learning_rate": 4.7026279391424617e-07, "logits/chosen": -2.28125, "logits/rejected": -2.03125, "logps/chosen": -374.0, "logps/rejected": -544.0, "loss": 0.6111, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.9296875, "rewards/margins": 1.921875, "rewards/rejected": -3.84375, "step": 1360 }, { "epoch": 0.09480312781122413, "grad_norm": 22.18522211418035, "learning_rate": 4.7372060857538034e-07, "logits/chosen": -2.46875, "logits/rejected": -2.09375, "logps/chosen": -356.0, "logps/rejected": -580.0, "loss": 0.5556, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -1.9375, "rewards/margins": 2.25, "rewards/rejected": -4.1875, "step": 1370 }, { "epoch": 0.0954951214448827, "grad_norm": 17.251648904841016, "learning_rate": 4.771784232365145e-07, "logits/chosen": -2.4375, "logits/rejected": -2.296875, "logps/chosen": -440.0, "logps/rejected": -628.0, "loss": 0.5991, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -2.46875, "rewards/margins": 2.109375, "rewards/rejected": -4.59375, "step": 1380 }, { "epoch": 0.09618711507854127, "grad_norm": 21.596865757910525, "learning_rate": 4.806362378976487e-07, "logits/chosen": -2.421875, "logits/rejected": -2.15625, "logps/chosen": -402.0, "logps/rejected": -612.0, "loss": 0.5628, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -2.03125, "rewards/margins": 2.375, "rewards/rejected": -4.40625, "step": 1390 }, { "epoch": 0.09687910871219985, "grad_norm": 23.16157843205117, "learning_rate": 4.840940525587828e-07, "logits/chosen": -2.53125, "logits/rejected": -2.296875, "logps/chosen": -432.0, "logps/rejected": -604.0, "loss": 0.579, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.75, "rewards/margins": 1.8125, "rewards/rejected": -4.5625, "step": 1400 }, { "epoch": 0.09757110234585842, "grad_norm": 24.450806373271522, "learning_rate": 4.87551867219917e-07, "logits/chosen": -2.390625, "logits/rejected": -2.125, "logps/chosen": -362.0, "logps/rejected": -568.0, "loss": 0.5445, "rewards/accuracies": 0.90625, "rewards/chosen": -1.984375, "rewards/margins": 2.265625, "rewards/rejected": -4.25, "step": 1410 }, { "epoch": 0.09826309597951699, "grad_norm": 21.22959909227978, "learning_rate": 4.910096818810512e-07, "logits/chosen": -2.328125, "logits/rejected": -1.953125, "logps/chosen": -420.0, "logps/rejected": -644.0, "loss": 0.559, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -2.421875, "rewards/margins": 2.265625, "rewards/rejected": -4.6875, "step": 1420 }, { "epoch": 0.09895508961317556, "grad_norm": 21.612754371623137, "learning_rate": 4.944674965421853e-07, "logits/chosen": -2.375, "logits/rejected": -2.125, "logps/chosen": -392.0, "logps/rejected": -636.0, "loss": 0.5304, "rewards/accuracies": 0.9375, "rewards/chosen": -2.109375, "rewards/margins": 2.53125, "rewards/rejected": -4.625, "step": 1430 }, { "epoch": 0.09964708324683413, "grad_norm": 20.11487493034149, "learning_rate": 4.979253112033195e-07, "logits/chosen": -2.359375, "logits/rejected": -1.953125, "logps/chosen": -412.0, "logps/rejected": -660.0, "loss": 0.5672, "rewards/accuracies": 0.875, "rewards/chosen": -2.421875, "rewards/margins": 2.265625, "rewards/rejected": -4.6875, "step": 1440 }, { "epoch": 0.1003390768804927, "grad_norm": 21.636303879528334, "learning_rate": 4.999998832897514e-07, "logits/chosen": -2.484375, "logits/rejected": -2.09375, "logps/chosen": -442.0, "logps/rejected": -660.0, "loss": 0.5202, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.609375, "rewards/margins": 2.359375, "rewards/rejected": -4.96875, "step": 1450 }, { "epoch": 0.10103107051415126, "grad_norm": 18.44571891671605, "learning_rate": 4.999985703007059e-07, "logits/chosen": -2.40625, "logits/rejected": -2.234375, "logps/chosen": -474.0, "logps/rejected": -760.0, "loss": 0.5372, "rewards/accuracies": 0.90625, "rewards/chosen": -2.875, "rewards/margins": 2.96875, "rewards/rejected": -5.84375, "step": 1460 }, { "epoch": 0.10172306414780984, "grad_norm": 21.568112499657527, "learning_rate": 4.999957984424916e-07, "logits/chosen": -2.40625, "logits/rejected": -2.203125, "logps/chosen": -464.0, "logps/rejected": -740.0, "loss": 0.5135, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -2.890625, "rewards/margins": 2.78125, "rewards/rejected": -5.6875, "step": 1470 }, { "epoch": 0.10241505778146841, "grad_norm": 20.75114420857829, "learning_rate": 4.999915677312838e-07, "logits/chosen": -2.546875, "logits/rejected": -2.328125, "logps/chosen": -486.0, "logps/rejected": -680.0, "loss": 0.5587, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -3.1875, "rewards/margins": 2.171875, "rewards/rejected": -5.34375, "step": 1480 }, { "epoch": 0.10310705141512698, "grad_norm": 25.94648188159898, "learning_rate": 4.999858781917709e-07, "logits/chosen": -2.515625, "logits/rejected": -2.3125, "logps/chosen": -492.0, "logps/rejected": -732.0, "loss": 0.5374, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -3.09375, "rewards/margins": 2.578125, "rewards/rejected": -5.6875, "step": 1490 }, { "epoch": 0.10379904504878555, "grad_norm": 20.60004851568036, "learning_rate": 4.99978729857154e-07, "logits/chosen": -2.421875, "logits/rejected": -2.28125, "logps/chosen": -510.0, "logps/rejected": -776.0, "loss": 0.5861, "rewards/accuracies": 0.90625, "rewards/chosen": -3.34375, "rewards/margins": 2.828125, "rewards/rejected": -6.1875, "step": 1500 }, { "epoch": 0.10449103868244412, "grad_norm": 21.48057822623696, "learning_rate": 4.999701227691476e-07, "logits/chosen": -2.484375, "logits/rejected": -2.34375, "logps/chosen": -470.0, "logps/rejected": -692.0, "loss": 0.5449, "rewards/accuracies": 0.8125, "rewards/chosen": -3.078125, "rewards/margins": 2.296875, "rewards/rejected": -5.375, "step": 1510 }, { "epoch": 0.1051830323161027, "grad_norm": 27.79664450326523, "learning_rate": 4.999600569779782e-07, "logits/chosen": -2.5625, "logits/rejected": -2.328125, "logps/chosen": -490.0, "logps/rejected": -772.0, "loss": 0.5206, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.96875, "rewards/margins": 2.890625, "rewards/rejected": -5.875, "step": 1520 }, { "epoch": 0.10587502594976127, "grad_norm": 21.157078702279975, "learning_rate": 4.99948532542385e-07, "logits/chosen": -2.484375, "logits/rejected": -2.125, "logps/chosen": -486.0, "logps/rejected": -724.0, "loss": 0.4932, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -2.9375, "rewards/margins": 2.65625, "rewards/rejected": -5.59375, "step": 1530 }, { "epoch": 0.10656701958341984, "grad_norm": 20.525423016928038, "learning_rate": 4.999355495296189e-07, "logits/chosen": -2.40625, "logits/rejected": -2.25, "logps/chosen": -512.0, "logps/rejected": -820.0, "loss": 0.5182, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -3.21875, "rewards/margins": 3.1875, "rewards/rejected": -6.40625, "step": 1540 }, { "epoch": 0.1072590132170784, "grad_norm": 24.1359355520331, "learning_rate": 4.999211080154422e-07, "logits/chosen": -2.65625, "logits/rejected": -2.53125, "logps/chosen": -502.0, "logps/rejected": -732.0, "loss": 0.5771, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -3.265625, "rewards/margins": 2.484375, "rewards/rejected": -5.75, "step": 1550 }, { "epoch": 0.10795100685073697, "grad_norm": 22.915988787341995, "learning_rate": 4.999052080841289e-07, "logits/chosen": -2.53125, "logits/rejected": -2.125, "logps/chosen": -520.0, "logps/rejected": -784.0, "loss": 0.5638, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -3.328125, "rewards/margins": 2.578125, "rewards/rejected": -5.90625, "step": 1560 }, { "epoch": 0.10864300048439554, "grad_norm": 17.104985548100547, "learning_rate": 4.99887849828463e-07, "logits/chosen": -2.5, "logits/rejected": -2.375, "logps/chosen": -500.0, "logps/rejected": -756.0, "loss": 0.4887, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -3.015625, "rewards/margins": 2.8125, "rewards/rejected": -5.8125, "step": 1570 }, { "epoch": 0.10933499411805411, "grad_norm": 23.16188344727809, "learning_rate": 4.998690333497387e-07, "logits/chosen": -2.421875, "logits/rejected": -2.3125, "logps/chosen": -524.0, "logps/rejected": -748.0, "loss": 0.5187, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -3.46875, "rewards/margins": 2.40625, "rewards/rejected": -5.875, "step": 1580 }, { "epoch": 0.11002698775171268, "grad_norm": 16.645246895566974, "learning_rate": 4.998487587577598e-07, "logits/chosen": -2.5625, "logits/rejected": -2.390625, "logps/chosen": -528.0, "logps/rejected": -800.0, "loss": 0.5347, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -3.578125, "rewards/margins": 2.703125, "rewards/rejected": -6.28125, "step": 1590 }, { "epoch": 0.11071898138537126, "grad_norm": 24.361693625383438, "learning_rate": 4.99827026170839e-07, "logits/chosen": -2.5, "logits/rejected": -2.28125, "logps/chosen": -486.0, "logps/rejected": -736.0, "loss": 0.5311, "rewards/accuracies": 0.875, "rewards/chosen": -3.140625, "rewards/margins": 2.59375, "rewards/rejected": -5.75, "step": 1600 }, { "epoch": 0.11141097501902983, "grad_norm": 15.569279161995029, "learning_rate": 4.998038357157968e-07, "logits/chosen": -2.34375, "logits/rejected": -2.34375, "logps/chosen": -480.0, "logps/rejected": -760.0, "loss": 0.4899, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.0, "rewards/margins": 2.921875, "rewards/rejected": -5.90625, "step": 1610 }, { "epoch": 0.1121029686526884, "grad_norm": 21.699542344112494, "learning_rate": 4.997791875279615e-07, "logits/chosen": -2.53125, "logits/rejected": -2.46875, "logps/chosen": -478.0, "logps/rejected": -756.0, "loss": 0.4803, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -3.0, "rewards/margins": 2.953125, "rewards/rejected": -5.9375, "step": 1620 }, { "epoch": 0.11279496228634697, "grad_norm": 20.36765550468458, "learning_rate": 4.997530817511679e-07, "logits/chosen": -2.5625, "logits/rejected": -2.40625, "logps/chosen": -502.0, "logps/rejected": -720.0, "loss": 0.5789, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.125, "rewards/margins": 2.375, "rewards/rejected": -5.5, "step": 1630 }, { "epoch": 0.11348695592000553, "grad_norm": 26.328761013166147, "learning_rate": 4.997255185377564e-07, "logits/chosen": -2.546875, "logits/rejected": -2.4375, "logps/chosen": -496.0, "logps/rejected": -744.0, "loss": 0.4957, "rewards/accuracies": 0.90625, "rewards/chosen": -3.203125, "rewards/margins": 2.765625, "rewards/rejected": -5.96875, "step": 1640 }, { "epoch": 0.1141789495536641, "grad_norm": 23.53281604654297, "learning_rate": 4.996964980485725e-07, "logits/chosen": -2.34375, "logits/rejected": -2.25, "logps/chosen": -480.0, "logps/rejected": -748.0, "loss": 0.5392, "rewards/accuracies": 0.875, "rewards/chosen": -3.203125, "rewards/margins": 2.703125, "rewards/rejected": -5.90625, "step": 1650 }, { "epoch": 0.11487094318732267, "grad_norm": 21.921077416855237, "learning_rate": 4.996660204529654e-07, "logits/chosen": -2.5, "logits/rejected": -2.296875, "logps/chosen": -470.0, "logps/rejected": -752.0, "loss": 0.5179, "rewards/accuracies": 0.90625, "rewards/chosen": -2.921875, "rewards/margins": 2.921875, "rewards/rejected": -5.84375, "step": 1660 }, { "epoch": 0.11556293682098125, "grad_norm": 22.09554416031629, "learning_rate": 4.996340859287876e-07, "logits/chosen": -2.40625, "logits/rejected": -2.265625, "logps/chosen": -474.0, "logps/rejected": -716.0, "loss": 0.5175, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.15625, "rewards/margins": 2.546875, "rewards/rejected": -5.6875, "step": 1670 }, { "epoch": 0.11625493045463982, "grad_norm": 19.69752347565679, "learning_rate": 4.996006946623932e-07, "logits/chosen": -2.328125, "logits/rejected": -2.15625, "logps/chosen": -476.0, "logps/rejected": -740.0, "loss": 0.489, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.84375, "rewards/margins": 2.890625, "rewards/rejected": -5.71875, "step": 1680 }, { "epoch": 0.11694692408829839, "grad_norm": 23.782635056267036, "learning_rate": 4.995658468486375e-07, "logits/chosen": -2.484375, "logits/rejected": -2.25, "logps/chosen": -476.0, "logps/rejected": -728.0, "loss": 0.49, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -2.953125, "rewards/margins": 2.703125, "rewards/rejected": -5.65625, "step": 1690 }, { "epoch": 0.11763891772195696, "grad_norm": 21.333085842607485, "learning_rate": 4.995295426908749e-07, "logits/chosen": -2.421875, "logits/rejected": -2.28125, "logps/chosen": -488.0, "logps/rejected": -732.0, "loss": 0.5472, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -3.21875, "rewards/margins": 2.515625, "rewards/rejected": -5.75, "step": 1700 }, { "epoch": 0.11833091135561553, "grad_norm": 25.393309605139915, "learning_rate": 4.994917824009589e-07, "logits/chosen": -2.609375, "logits/rejected": -2.421875, "logps/chosen": -510.0, "logps/rejected": -752.0, "loss": 0.4922, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.140625, "rewards/margins": 2.609375, "rewards/rejected": -5.75, "step": 1710 }, { "epoch": 0.11902290498927409, "grad_norm": 21.463394766386354, "learning_rate": 4.994525661992401e-07, "logits/chosen": -2.640625, "logits/rejected": -2.296875, "logps/chosen": -478.0, "logps/rejected": -820.0, "loss": 0.4892, "rewards/accuracies": 0.9375, "rewards/chosen": -2.921875, "rewards/margins": 3.5625, "rewards/rejected": -6.46875, "step": 1720 }, { "epoch": 0.11971489862293266, "grad_norm": 23.66570733954187, "learning_rate": 4.994118943145648e-07, "logits/chosen": -2.5625, "logits/rejected": -2.40625, "logps/chosen": -564.0, "logps/rejected": -876.0, "loss": 0.5589, "rewards/accuracies": 0.90625, "rewards/chosen": -3.953125, "rewards/margins": 2.96875, "rewards/rejected": -6.9375, "step": 1730 }, { "epoch": 0.12040689225659124, "grad_norm": 17.661208093349465, "learning_rate": 4.993697669842746e-07, "logits/chosen": -2.5625, "logits/rejected": -2.421875, "logps/chosen": -532.0, "logps/rejected": -772.0, "loss": 0.5119, "rewards/accuracies": 0.90625, "rewards/chosen": -3.390625, "rewards/margins": 2.71875, "rewards/rejected": -6.125, "step": 1740 }, { "epoch": 0.12109888589024981, "grad_norm": 28.12588386217, "learning_rate": 4.993261844542036e-07, "logits/chosen": -2.546875, "logits/rejected": -2.421875, "logps/chosen": -524.0, "logps/rejected": -812.0, "loss": 0.4881, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.53125, "rewards/margins": 3.234375, "rewards/rejected": -6.75, "step": 1750 }, { "epoch": 0.12179087952390838, "grad_norm": 23.0474180404373, "learning_rate": 4.992811469786782e-07, "logits/chosen": -2.453125, "logits/rejected": -2.25, "logps/chosen": -496.0, "logps/rejected": -780.0, "loss": 0.4937, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.15625, "rewards/margins": 3.015625, "rewards/rejected": -6.1875, "step": 1760 }, { "epoch": 0.12248287315756695, "grad_norm": 39.848413938347214, "learning_rate": 4.99234654820515e-07, "logits/chosen": -2.53125, "logits/rejected": -2.3125, "logps/chosen": -512.0, "logps/rejected": -780.0, "loss": 0.521, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -3.40625, "rewards/margins": 2.625, "rewards/rejected": -6.03125, "step": 1770 }, { "epoch": 0.12317486679122552, "grad_norm": 18.878512079147985, "learning_rate": 4.991867082510196e-07, "logits/chosen": -2.46875, "logits/rejected": -2.328125, "logps/chosen": -502.0, "logps/rejected": -728.0, "loss": 0.4753, "rewards/accuracies": 0.90625, "rewards/chosen": -3.25, "rewards/margins": 2.609375, "rewards/rejected": -5.875, "step": 1780 }, { "epoch": 0.1238668604248841, "grad_norm": 20.58266267535254, "learning_rate": 4.991373075499846e-07, "logits/chosen": -2.515625, "logits/rejected": -2.46875, "logps/chosen": -516.0, "logps/rejected": -764.0, "loss": 0.4918, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -3.4375, "rewards/margins": 2.734375, "rewards/rejected": -6.1875, "step": 1790 }, { "epoch": 0.12455885405854267, "grad_norm": 17.16761005362731, "learning_rate": 4.990864530056881e-07, "logits/chosen": -2.453125, "logits/rejected": -2.21875, "logps/chosen": -516.0, "logps/rejected": -804.0, "loss": 0.4914, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -3.375, "rewards/margins": 2.84375, "rewards/rejected": -6.21875, "step": 1800 }, { "epoch": 0.12525084769220124, "grad_norm": 21.041492996940537, "learning_rate": 4.990341449148927e-07, "logits/chosen": -2.53125, "logits/rejected": -2.359375, "logps/chosen": -504.0, "logps/rejected": -772.0, "loss": 0.4909, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.203125, "rewards/margins": 2.765625, "rewards/rejected": -5.96875, "step": 1810 }, { "epoch": 0.1259428413258598, "grad_norm": 23.218222435468242, "learning_rate": 4.989803835828425e-07, "logits/chosen": -2.59375, "logits/rejected": -2.46875, "logps/chosen": -512.0, "logps/rejected": -824.0, "loss": 0.479, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.5, "rewards/margins": 2.953125, "rewards/rejected": -6.46875, "step": 1820 }, { "epoch": 0.12663483495951838, "grad_norm": 20.76590182596264, "learning_rate": 4.989251693232623e-07, "logits/chosen": -2.53125, "logits/rejected": -2.421875, "logps/chosen": -524.0, "logps/rejected": -812.0, "loss": 0.4679, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.59375, "rewards/margins": 3.03125, "rewards/rejected": -6.625, "step": 1830 }, { "epoch": 0.12732682859317696, "grad_norm": 23.48451287055334, "learning_rate": 4.988685024583557e-07, "logits/chosen": -2.578125, "logits/rejected": -2.515625, "logps/chosen": -496.0, "logps/rejected": -752.0, "loss": 0.5139, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -3.3125, "rewards/margins": 2.671875, "rewards/rejected": -5.96875, "step": 1840 }, { "epoch": 0.12801882222683553, "grad_norm": 18.533592876193353, "learning_rate": 4.988103833188024e-07, "logits/chosen": -2.5625, "logits/rejected": -2.515625, "logps/chosen": -536.0, "logps/rejected": -760.0, "loss": 0.4784, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -3.53125, "rewards/margins": 2.53125, "rewards/rejected": -6.0625, "step": 1850 }, { "epoch": 0.12871081586049407, "grad_norm": 22.75170038766669, "learning_rate": 4.987508122437575e-07, "logits/chosen": -2.484375, "logits/rejected": -2.375, "logps/chosen": -516.0, "logps/rejected": -812.0, "loss": 0.4808, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -3.40625, "rewards/margins": 3.109375, "rewards/rejected": -6.53125, "step": 1860 }, { "epoch": 0.12940280949415264, "grad_norm": 24.44224192998829, "learning_rate": 4.986897895808485e-07, "logits/chosen": -2.59375, "logits/rejected": -2.46875, "logps/chosen": -496.0, "logps/rejected": -748.0, "loss": 0.4513, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -3.234375, "rewards/margins": 2.75, "rewards/rejected": -5.96875, "step": 1870 }, { "epoch": 0.13009480312781121, "grad_norm": 19.329296217243634, "learning_rate": 4.986273156861738e-07, "logits/chosen": -2.546875, "logits/rejected": -2.3125, "logps/chosen": -548.0, "logps/rejected": -784.0, "loss": 0.5148, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -3.890625, "rewards/margins": 2.421875, "rewards/rejected": -6.3125, "step": 1880 }, { "epoch": 0.1307867967614698, "grad_norm": 20.237945155766234, "learning_rate": 4.985633909243004e-07, "logits/chosen": -2.53125, "logits/rejected": -2.34375, "logps/chosen": -556.0, "logps/rejected": -800.0, "loss": 0.468, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -3.75, "rewards/margins": 2.65625, "rewards/rejected": -6.40625, "step": 1890 }, { "epoch": 0.13147879039512836, "grad_norm": 23.302378821737662, "learning_rate": 4.98498015668262e-07, "logits/chosen": -2.484375, "logits/rejected": -2.375, "logps/chosen": -508.0, "logps/rejected": -788.0, "loss": 0.4737, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.375, "rewards/margins": 2.890625, "rewards/rejected": -6.25, "step": 1900 }, { "epoch": 0.13217078402878693, "grad_norm": 23.94384271266701, "learning_rate": 4.984311902995564e-07, "logits/chosen": -2.5, "logits/rejected": -2.34375, "logps/chosen": -548.0, "logps/rejected": -852.0, "loss": 0.4715, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.78125, "rewards/margins": 2.984375, "rewards/rejected": -6.75, "step": 1910 }, { "epoch": 0.1328627776624455, "grad_norm": 24.3875082739124, "learning_rate": 4.983629152081439e-07, "logits/chosen": -2.53125, "logits/rejected": -2.53125, "logps/chosen": -504.0, "logps/rejected": -804.0, "loss": 0.4812, "rewards/accuracies": 0.9375, "rewards/chosen": -3.484375, "rewards/margins": 2.984375, "rewards/rejected": -6.46875, "step": 1920 }, { "epoch": 0.13355477129610407, "grad_norm": 28.108233009749767, "learning_rate": 4.982931907924442e-07, "logits/chosen": -2.640625, "logits/rejected": -2.5, "logps/chosen": -532.0, "logps/rejected": -816.0, "loss": 0.4776, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -3.546875, "rewards/margins": 3.046875, "rewards/rejected": -6.59375, "step": 1930 }, { "epoch": 0.13424676492976265, "grad_norm": 20.347594705911213, "learning_rate": 4.98222017459335e-07, "logits/chosen": -2.578125, "logits/rejected": -2.578125, "logps/chosen": -528.0, "logps/rejected": -872.0, "loss": 0.4444, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -3.46875, "rewards/margins": 3.28125, "rewards/rejected": -6.75, "step": 1940 }, { "epoch": 0.13493875856342122, "grad_norm": 22.22885522260826, "learning_rate": 4.981493956241491e-07, "logits/chosen": -2.640625, "logits/rejected": -2.484375, "logps/chosen": -524.0, "logps/rejected": -784.0, "loss": 0.4842, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -3.5, "rewards/margins": 2.703125, "rewards/rejected": -6.21875, "step": 1950 }, { "epoch": 0.1356307521970798, "grad_norm": 20.907880907284838, "learning_rate": 4.980753257106719e-07, "logits/chosen": -2.578125, "logits/rejected": -2.6875, "logps/chosen": -504.0, "logps/rejected": -792.0, "loss": 0.4384, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.421875, "rewards/margins": 2.921875, "rewards/rejected": -6.34375, "step": 1960 }, { "epoch": 0.13632274583073836, "grad_norm": 20.45747217752141, "learning_rate": 4.979998081511389e-07, "logits/chosen": -2.5625, "logits/rejected": -2.59375, "logps/chosen": -512.0, "logps/rejected": -800.0, "loss": 0.4733, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.3125, "rewards/margins": 3.140625, "rewards/rejected": -6.4375, "step": 1970 }, { "epoch": 0.13701473946439693, "grad_norm": 20.13973497229635, "learning_rate": 4.979228433862339e-07, "logits/chosen": -2.578125, "logits/rejected": -2.40625, "logps/chosen": -536.0, "logps/rejected": -892.0, "loss": 0.4227, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -3.484375, "rewards/margins": 3.203125, "rewards/rejected": -6.6875, "step": 1980 }, { "epoch": 0.1377067330980555, "grad_norm": 23.04944858303902, "learning_rate": 4.978444318650854e-07, "logits/chosen": -2.609375, "logits/rejected": -2.359375, "logps/chosen": -548.0, "logps/rejected": -824.0, "loss": 0.4821, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.65625, "rewards/margins": 2.9375, "rewards/rejected": -6.59375, "step": 1990 }, { "epoch": 0.13839872673171408, "grad_norm": 22.60565052612306, "learning_rate": 4.977645740452645e-07, "logits/chosen": -2.5625, "logits/rejected": -2.53125, "logps/chosen": -524.0, "logps/rejected": -788.0, "loss": 0.5161, "rewards/accuracies": 0.90625, "rewards/chosen": -3.484375, "rewards/margins": 2.671875, "rewards/rejected": -6.15625, "step": 2000 }, { "epoch": 0.13839872673171408, "eval_logits/chosen": -2.578125, "eval_logits/rejected": -2.484375, "eval_logps/chosen": -544.0, "eval_logps/rejected": -784.0, "eval_loss": 0.26600268483161926, "eval_rewards/accuracies": 0.8813767433166504, "eval_rewards/chosen": -3.53125, "eval_rewards/margins": 2.6875, "eval_rewards/rejected": -6.21875, "eval_runtime": 2936.0951, "eval_samples_per_second": 33.321, "eval_steps_per_second": 0.521, "step": 2000 }, { "epoch": 0.13909072036537265, "grad_norm": 22.927060614599938, "learning_rate": 4.976832703927826e-07, "logits/chosen": -2.609375, "logits/rejected": -2.421875, "logps/chosen": -544.0, "logps/rejected": -852.0, "loss": 0.4652, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.671875, "rewards/margins": 3.125, "rewards/rejected": -6.8125, "step": 2010 }, { "epoch": 0.13978271399903122, "grad_norm": 20.169689850864096, "learning_rate": 4.976005213820878e-07, "logits/chosen": -2.625, "logits/rejected": -2.53125, "logps/chosen": -524.0, "logps/rejected": -836.0, "loss": 0.4369, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.578125, "rewards/margins": 3.09375, "rewards/rejected": -6.65625, "step": 2020 }, { "epoch": 0.14047470763268977, "grad_norm": 28.8089147997869, "learning_rate": 4.975163274960627e-07, "logits/chosen": -2.5625, "logits/rejected": -2.515625, "logps/chosen": -520.0, "logps/rejected": -832.0, "loss": 0.3987, "rewards/accuracies": 0.9375, "rewards/chosen": -3.359375, "rewards/margins": 3.171875, "rewards/rejected": -6.53125, "step": 2030 }, { "epoch": 0.14116670126634834, "grad_norm": 27.33130954134726, "learning_rate": 4.974306892260217e-07, "logits/chosen": -2.515625, "logits/rejected": -2.4375, "logps/chosen": -544.0, "logps/rejected": -828.0, "loss": 0.4757, "rewards/accuracies": 0.90625, "rewards/chosen": -3.6875, "rewards/margins": 3.03125, "rewards/rejected": -6.71875, "step": 2040 }, { "epoch": 0.1418586949000069, "grad_norm": 18.8820676407958, "learning_rate": 4.973436070717078e-07, "logits/chosen": -2.625, "logits/rejected": -2.5625, "logps/chosen": -524.0, "logps/rejected": -780.0, "loss": 0.4353, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -3.453125, "rewards/margins": 2.625, "rewards/rejected": -6.0625, "step": 2050 }, { "epoch": 0.14255068853366548, "grad_norm": 22.517171012080873, "learning_rate": 4.972550815412896e-07, "logits/chosen": -2.65625, "logits/rejected": -2.5625, "logps/chosen": -560.0, "logps/rejected": -860.0, "loss": 0.4982, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -3.59375, "rewards/margins": 3.25, "rewards/rejected": -6.84375, "step": 2060 }, { "epoch": 0.14324268216732405, "grad_norm": 28.07753977261864, "learning_rate": 4.97165113151359e-07, "logits/chosen": -2.5, "logits/rejected": -2.5625, "logps/chosen": -524.0, "logps/rejected": -816.0, "loss": 0.4497, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -3.5625, "rewards/margins": 3.046875, "rewards/rejected": -6.59375, "step": 2070 }, { "epoch": 0.14393467580098263, "grad_norm": 32.18866500240505, "learning_rate": 4.970737024269273e-07, "logits/chosen": -2.609375, "logits/rejected": -2.65625, "logps/chosen": -520.0, "logps/rejected": -796.0, "loss": 0.4644, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.28125, "rewards/margins": 3.078125, "rewards/rejected": -6.34375, "step": 2080 }, { "epoch": 0.1446266694346412, "grad_norm": 27.176688050191203, "learning_rate": 4.969808499014227e-07, "logits/chosen": -2.703125, "logits/rejected": -2.703125, "logps/chosen": -506.0, "logps/rejected": -808.0, "loss": 0.4326, "rewards/accuracies": 0.90625, "rewards/chosen": -3.421875, "rewards/margins": 3.125, "rewards/rejected": -6.53125, "step": 2090 }, { "epoch": 0.14531866306829977, "grad_norm": 20.590746081420207, "learning_rate": 4.96886556116687e-07, "logits/chosen": -2.546875, "logits/rejected": -2.46875, "logps/chosen": -516.0, "logps/rejected": -744.0, "loss": 0.4606, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -3.25, "rewards/margins": 2.640625, "rewards/rejected": -5.90625, "step": 2100 }, { "epoch": 0.14601065670195834, "grad_norm": 84.88843482487788, "learning_rate": 4.967908216229727e-07, "logits/chosen": -2.609375, "logits/rejected": -2.421875, "logps/chosen": -490.0, "logps/rejected": -792.0, "loss": 0.4663, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -3.140625, "rewards/margins": 3.015625, "rewards/rejected": -6.15625, "step": 2110 }, { "epoch": 0.1467026503356169, "grad_norm": 26.722950562915496, "learning_rate": 4.966936469789391e-07, "logits/chosen": -2.453125, "logits/rejected": -2.453125, "logps/chosen": -536.0, "logps/rejected": -792.0, "loss": 0.4284, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.546875, "rewards/margins": 2.828125, "rewards/rejected": -6.375, "step": 2120 }, { "epoch": 0.14739464396927549, "grad_norm": 25.72661142134039, "learning_rate": 4.965950327516502e-07, "logits/chosen": -2.5, "logits/rejected": -2.4375, "logps/chosen": -512.0, "logps/rejected": -820.0, "loss": 0.4853, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.3125, "rewards/margins": 3.0625, "rewards/rejected": -6.375, "step": 2130 }, { "epoch": 0.14808663760293406, "grad_norm": 20.38273596331589, "learning_rate": 4.9649497951657e-07, "logits/chosen": -2.53125, "logits/rejected": -2.484375, "logps/chosen": -524.0, "logps/rejected": -860.0, "loss": 0.4375, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -3.453125, "rewards/margins": 3.453125, "rewards/rejected": -6.9375, "step": 2140 }, { "epoch": 0.14877863123659263, "grad_norm": 17.916949995644135, "learning_rate": 4.963934878575603e-07, "logits/chosen": -2.578125, "logits/rejected": -2.46875, "logps/chosen": -478.0, "logps/rejected": -816.0, "loss": 0.4553, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.96875, "rewards/margins": 3.34375, "rewards/rejected": -6.3125, "step": 2150 }, { "epoch": 0.1494706248702512, "grad_norm": 16.404152188596182, "learning_rate": 4.962905583668766e-07, "logits/chosen": -2.671875, "logits/rejected": -2.5625, "logps/chosen": -540.0, "logps/rejected": -824.0, "loss": 0.4137, "rewards/accuracies": 0.875, "rewards/chosen": -3.703125, "rewards/margins": 2.78125, "rewards/rejected": -6.46875, "step": 2160 }, { "epoch": 0.15016261850390977, "grad_norm": 18.32648742757125, "learning_rate": 4.961861916451651e-07, "logits/chosen": -2.734375, "logits/rejected": -2.65625, "logps/chosen": -510.0, "logps/rejected": -788.0, "loss": 0.4519, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.390625, "rewards/margins": 3.015625, "rewards/rejected": -6.40625, "step": 2170 }, { "epoch": 0.15085461213756834, "grad_norm": 26.011283503495537, "learning_rate": 4.960803883014587e-07, "logits/chosen": -2.59375, "logits/rejected": -2.578125, "logps/chosen": -494.0, "logps/rejected": -796.0, "loss": 0.4813, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -3.171875, "rewards/margins": 3.09375, "rewards/rejected": -6.28125, "step": 2180 }, { "epoch": 0.15154660577122692, "grad_norm": 25.66766351181709, "learning_rate": 4.959731489531741e-07, "logits/chosen": -2.578125, "logits/rejected": -2.484375, "logps/chosen": -524.0, "logps/rejected": -844.0, "loss": 0.4589, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.5, "rewards/margins": 3.3125, "rewards/rejected": -6.8125, "step": 2190 }, { "epoch": 0.1522385994048855, "grad_norm": 23.090524776205708, "learning_rate": 4.958644742261074e-07, "logits/chosen": -2.6875, "logits/rejected": -2.46875, "logps/chosen": -502.0, "logps/rejected": -800.0, "loss": 0.4449, "rewards/accuracies": 0.875, "rewards/chosen": -3.328125, "rewards/margins": 3.0, "rewards/rejected": -6.3125, "step": 2200 }, { "epoch": 0.15293059303854403, "grad_norm": 21.95849498472782, "learning_rate": 4.957543647544311e-07, "logits/chosen": -2.703125, "logits/rejected": -2.71875, "logps/chosen": -540.0, "logps/rejected": -812.0, "loss": 0.5008, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.515625, "rewards/margins": 2.890625, "rewards/rejected": -6.40625, "step": 2210 }, { "epoch": 0.1536225866722026, "grad_norm": 28.272887813698684, "learning_rate": 4.956428211806902e-07, "logits/chosen": -2.515625, "logits/rejected": -2.390625, "logps/chosen": -568.0, "logps/rejected": -820.0, "loss": 0.4754, "rewards/accuracies": 0.875, "rewards/chosen": -3.90625, "rewards/margins": 2.703125, "rewards/rejected": -6.625, "step": 2220 }, { "epoch": 0.15431458030586118, "grad_norm": 17.787923929383005, "learning_rate": 4.955298441557983e-07, "logits/chosen": -2.6875, "logits/rejected": -2.453125, "logps/chosen": -492.0, "logps/rejected": -832.0, "loss": 0.4134, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.21875, "rewards/margins": 3.34375, "rewards/rejected": -6.5625, "step": 2230 }, { "epoch": 0.15500657393951975, "grad_norm": 19.08715974652577, "learning_rate": 4.954154343390339e-07, "logits/chosen": -2.6875, "logits/rejected": -2.640625, "logps/chosen": -510.0, "logps/rejected": -808.0, "loss": 0.3895, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.4375, "rewards/margins": 3.0625, "rewards/rejected": -6.5, "step": 2240 }, { "epoch": 0.15569856757317832, "grad_norm": 19.945593432716976, "learning_rate": 4.952995923980367e-07, "logits/chosen": -2.59375, "logits/rejected": -2.421875, "logps/chosen": -528.0, "logps/rejected": -848.0, "loss": 0.397, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.453125, "rewards/margins": 3.28125, "rewards/rejected": -6.71875, "step": 2250 }, { "epoch": 0.1563905612068369, "grad_norm": 19.67029295344873, "learning_rate": 4.951823190088035e-07, "logits/chosen": -2.59375, "logits/rejected": -2.59375, "logps/chosen": -516.0, "logps/rejected": -816.0, "loss": 0.4593, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -3.34375, "rewards/margins": 3.109375, "rewards/rejected": -6.4375, "step": 2260 }, { "epoch": 0.15708255484049546, "grad_norm": 19.634939071257513, "learning_rate": 4.950636148556844e-07, "logits/chosen": -2.578125, "logits/rejected": -2.5625, "logps/chosen": -560.0, "logps/rejected": -848.0, "loss": 0.4294, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.625, "rewards/margins": 3.0625, "rewards/rejected": -6.6875, "step": 2270 }, { "epoch": 0.15777454847415404, "grad_norm": 19.906786502684835, "learning_rate": 4.949434806313786e-07, "logits/chosen": -2.65625, "logits/rejected": -2.515625, "logps/chosen": -564.0, "logps/rejected": -880.0, "loss": 0.4277, "rewards/accuracies": 0.9375, "rewards/chosen": -3.78125, "rewards/margins": 3.3125, "rewards/rejected": -7.09375, "step": 2280 }, { "epoch": 0.1584665421078126, "grad_norm": 27.190983649133962, "learning_rate": 4.948219170369306e-07, "logits/chosen": -2.625, "logits/rejected": -2.6875, "logps/chosen": -510.0, "logps/rejected": -800.0, "loss": 0.4258, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -3.453125, "rewards/margins": 3.171875, "rewards/rejected": -6.625, "step": 2290 }, { "epoch": 0.15915853574147118, "grad_norm": 25.242355272138322, "learning_rate": 4.94698924781726e-07, "logits/chosen": -2.671875, "logits/rejected": -2.609375, "logps/chosen": -544.0, "logps/rejected": -824.0, "loss": 0.4278, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -3.640625, "rewards/margins": 2.90625, "rewards/rejected": -6.53125, "step": 2300 }, { "epoch": 0.15985052937512975, "grad_norm": 21.816345970594174, "learning_rate": 4.945745045834873e-07, "logits/chosen": -2.75, "logits/rejected": -2.546875, "logps/chosen": -556.0, "logps/rejected": -828.0, "loss": 0.4632, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -3.703125, "rewards/margins": 3.046875, "rewards/rejected": -6.75, "step": 2310 }, { "epoch": 0.16054252300878832, "grad_norm": 25.015416514080613, "learning_rate": 4.944486571682699e-07, "logits/chosen": -2.59375, "logits/rejected": -2.53125, "logps/chosen": -560.0, "logps/rejected": -836.0, "loss": 0.4314, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.859375, "rewards/margins": 2.9375, "rewards/rejected": -6.78125, "step": 2320 }, { "epoch": 0.1612345166424469, "grad_norm": 24.620472275218432, "learning_rate": 4.943213832704574e-07, "logits/chosen": -2.625, "logits/rejected": -2.46875, "logps/chosen": -548.0, "logps/rejected": -836.0, "loss": 0.3891, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -3.625, "rewards/margins": 3.109375, "rewards/rejected": -6.75, "step": 2330 }, { "epoch": 0.16192651027610547, "grad_norm": 30.356016371979095, "learning_rate": 4.941926836327583e-07, "logits/chosen": -2.5625, "logits/rejected": -2.625, "logps/chosen": -528.0, "logps/rejected": -820.0, "loss": 0.4346, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.390625, "rewards/margins": 3.140625, "rewards/rejected": -6.53125, "step": 2340 }, { "epoch": 0.16261850390976404, "grad_norm": 20.027030729110727, "learning_rate": 4.940625590062003e-07, "logits/chosen": -2.734375, "logits/rejected": -2.546875, "logps/chosen": -588.0, "logps/rejected": -888.0, "loss": 0.4854, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.125, "rewards/margins": 3.09375, "rewards/rejected": -7.21875, "step": 2350 }, { "epoch": 0.1633104975434226, "grad_norm": 22.454950281892526, "learning_rate": 4.939310101501272e-07, "logits/chosen": -2.65625, "logits/rejected": -2.578125, "logps/chosen": -576.0, "logps/rejected": -872.0, "loss": 0.3931, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -3.96875, "rewards/margins": 3.078125, "rewards/rejected": -7.0625, "step": 2360 }, { "epoch": 0.16400249117708118, "grad_norm": 25.207874011942, "learning_rate": 4.937980378321935e-07, "logits/chosen": -2.5625, "logits/rejected": -2.46875, "logps/chosen": -512.0, "logps/rejected": -804.0, "loss": 0.4303, "rewards/accuracies": 0.90625, "rewards/chosen": -3.453125, "rewards/margins": 3.15625, "rewards/rejected": -6.59375, "step": 2370 }, { "epoch": 0.16469448481073973, "grad_norm": 21.501224231237444, "learning_rate": 4.936636428283605e-07, "logits/chosen": -2.625, "logits/rejected": -2.390625, "logps/chosen": -528.0, "logps/rejected": -864.0, "loss": 0.383, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -3.5625, "rewards/margins": 3.359375, "rewards/rejected": -6.9375, "step": 2380 }, { "epoch": 0.1653864784443983, "grad_norm": 29.844190117907, "learning_rate": 4.935278259228918e-07, "logits/chosen": -2.59375, "logits/rejected": -2.59375, "logps/chosen": -548.0, "logps/rejected": -880.0, "loss": 0.4674, "rewards/accuracies": 0.9375, "rewards/chosen": -3.671875, "rewards/margins": 3.484375, "rewards/rejected": -7.15625, "step": 2390 }, { "epoch": 0.16607847207805687, "grad_norm": 23.812974807435584, "learning_rate": 4.933905879083481e-07, "logits/chosen": -2.75, "logits/rejected": -2.5625, "logps/chosen": -596.0, "logps/rejected": -908.0, "loss": 0.4208, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.09375, "rewards/margins": 3.25, "rewards/rejected": -7.34375, "step": 2400 }, { "epoch": 0.16677046571171544, "grad_norm": 21.511086510103453, "learning_rate": 4.932519295855832e-07, "logits/chosen": -2.5625, "logits/rejected": -2.53125, "logps/chosen": -548.0, "logps/rejected": -864.0, "loss": 0.4816, "rewards/accuracies": 0.90625, "rewards/chosen": -3.53125, "rewards/margins": 3.34375, "rewards/rejected": -6.875, "step": 2410 }, { "epoch": 0.16746245934537402, "grad_norm": 22.645250832704345, "learning_rate": 4.931118517637394e-07, "logits/chosen": -2.609375, "logits/rejected": -2.53125, "logps/chosen": -500.0, "logps/rejected": -840.0, "loss": 0.3995, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.328125, "rewards/margins": 3.4375, "rewards/rejected": -6.75, "step": 2420 }, { "epoch": 0.1681544529790326, "grad_norm": 30.274342096272335, "learning_rate": 4.929703552602419e-07, "logits/chosen": -2.703125, "logits/rejected": -2.671875, "logps/chosen": -568.0, "logps/rejected": -876.0, "loss": 0.3978, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -3.78125, "rewards/margins": 3.390625, "rewards/rejected": -7.15625, "step": 2430 }, { "epoch": 0.16884644661269116, "grad_norm": 28.942943344260517, "learning_rate": 4.928274409007952e-07, "logits/chosen": -2.734375, "logits/rejected": -2.671875, "logps/chosen": -556.0, "logps/rejected": -864.0, "loss": 0.4312, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -3.734375, "rewards/margins": 3.296875, "rewards/rejected": -7.03125, "step": 2440 }, { "epoch": 0.16953844024634973, "grad_norm": 19.893020860290537, "learning_rate": 4.926831095193774e-07, "logits/chosen": -2.59375, "logits/rejected": -2.5625, "logps/chosen": -560.0, "logps/rejected": -844.0, "loss": 0.4511, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -3.9375, "rewards/margins": 2.890625, "rewards/rejected": -6.8125, "step": 2450 }, { "epoch": 0.1702304338800083, "grad_norm": 26.259121100086258, "learning_rate": 4.925373619582357e-07, "logits/chosen": -2.609375, "logits/rejected": -2.515625, "logps/chosen": -516.0, "logps/rejected": -812.0, "loss": 0.4192, "rewards/accuracies": 0.875, "rewards/chosen": -3.484375, "rewards/margins": 3.125, "rewards/rejected": -6.59375, "step": 2460 }, { "epoch": 0.17092242751366687, "grad_norm": 24.230354060609265, "learning_rate": 4.923901990678815e-07, "logits/chosen": -2.640625, "logits/rejected": -2.53125, "logps/chosen": -552.0, "logps/rejected": -848.0, "loss": 0.4103, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.5625, "rewards/margins": 3.390625, "rewards/rejected": -6.96875, "step": 2470 }, { "epoch": 0.17161442114732545, "grad_norm": 19.778436836297075, "learning_rate": 4.922416217070853e-07, "logits/chosen": -2.65625, "logits/rejected": -2.765625, "logps/chosen": -552.0, "logps/rejected": -880.0, "loss": 0.4205, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.671875, "rewards/margins": 3.5, "rewards/rejected": -7.1875, "step": 2480 }, { "epoch": 0.17230641478098402, "grad_norm": 20.184514413138512, "learning_rate": 4.920916307428719e-07, "logits/chosen": -2.703125, "logits/rejected": -2.5625, "logps/chosen": -552.0, "logps/rejected": -920.0, "loss": 0.4183, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -3.6875, "rewards/margins": 4.0, "rewards/rejected": -7.6875, "step": 2490 }, { "epoch": 0.1729984084146426, "grad_norm": 19.106690001719695, "learning_rate": 4.91940227050515e-07, "logits/chosen": -2.671875, "logits/rejected": -2.6875, "logps/chosen": -540.0, "logps/rejected": -872.0, "loss": 0.3731, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.703125, "rewards/margins": 3.359375, "rewards/rejected": -7.0625, "step": 2500 }, { "epoch": 0.17369040204830116, "grad_norm": 20.45902540812319, "learning_rate": 4.917874115135322e-07, "logits/chosen": -2.75, "logits/rejected": -2.796875, "logps/chosen": -544.0, "logps/rejected": -868.0, "loss": 0.3513, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.765625, "rewards/margins": 3.390625, "rewards/rejected": -7.15625, "step": 2510 }, { "epoch": 0.17438239568195973, "grad_norm": 19.432497878357456, "learning_rate": 4.916331850236803e-07, "logits/chosen": -2.75, "logits/rejected": -2.625, "logps/chosen": -564.0, "logps/rejected": -864.0, "loss": 0.4276, "rewards/accuracies": 0.875, "rewards/chosen": -3.921875, "rewards/margins": 3.1875, "rewards/rejected": -7.125, "step": 2520 }, { "epoch": 0.1750743893156183, "grad_norm": 26.10968400799072, "learning_rate": 4.914775484809495e-07, "logits/chosen": -2.640625, "logits/rejected": -2.765625, "logps/chosen": -584.0, "logps/rejected": -892.0, "loss": 0.4204, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -3.96875, "rewards/margins": 3.515625, "rewards/rejected": -7.5, "step": 2530 }, { "epoch": 0.17576638294927688, "grad_norm": 16.232082271442206, "learning_rate": 4.913205027935583e-07, "logits/chosen": -2.75, "logits/rejected": -2.546875, "logps/chosen": -560.0, "logps/rejected": -944.0, "loss": 0.383, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.71875, "rewards/margins": 3.75, "rewards/rejected": -7.46875, "step": 2540 }, { "epoch": 0.17645837658293545, "grad_norm": 24.591536214136095, "learning_rate": 4.911620488779485e-07, "logits/chosen": -2.75, "logits/rejected": -2.78125, "logps/chosen": -576.0, "logps/rejected": -908.0, "loss": 0.4223, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.03125, "rewards/margins": 3.390625, "rewards/rejected": -7.4375, "step": 2550 }, { "epoch": 0.177150370216594, "grad_norm": 28.214074394419384, "learning_rate": 4.910021876587796e-07, "logits/chosen": -2.703125, "logits/rejected": -2.6875, "logps/chosen": -532.0, "logps/rejected": -832.0, "loss": 0.4128, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.75, "rewards/margins": 3.03125, "rewards/rejected": -6.78125, "step": 2560 }, { "epoch": 0.17784236385025257, "grad_norm": 23.16477539414472, "learning_rate": 4.908409200689231e-07, "logits/chosen": -2.8125, "logits/rejected": -2.765625, "logps/chosen": -560.0, "logps/rejected": -896.0, "loss": 0.4236, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.890625, "rewards/margins": 3.40625, "rewards/rejected": -7.3125, "step": 2570 }, { "epoch": 0.17853435748391114, "grad_norm": 20.465737311622817, "learning_rate": 4.906782470494578e-07, "logits/chosen": -2.796875, "logits/rejected": -2.65625, "logps/chosen": -560.0, "logps/rejected": -916.0, "loss": 0.3854, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -3.921875, "rewards/margins": 3.578125, "rewards/rejected": -7.5, "step": 2580 }, { "epoch": 0.1792263511175697, "grad_norm": 19.78261980220442, "learning_rate": 4.905141695496639e-07, "logits/chosen": -2.671875, "logits/rejected": -2.703125, "logps/chosen": -576.0, "logps/rejected": -912.0, "loss": 0.4303, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.09375, "rewards/margins": 3.359375, "rewards/rejected": -7.46875, "step": 2590 }, { "epoch": 0.17991834475122828, "grad_norm": 24.006933797381745, "learning_rate": 4.903486885270169e-07, "logits/chosen": -2.640625, "logits/rejected": -2.625, "logps/chosen": -572.0, "logps/rejected": -912.0, "loss": 0.3912, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -3.75, "rewards/margins": 3.625, "rewards/rejected": -7.375, "step": 2600 }, { "epoch": 0.18061033838488685, "grad_norm": 18.84451165851233, "learning_rate": 4.901818049471832e-07, "logits/chosen": -2.6875, "logits/rejected": -2.71875, "logps/chosen": -560.0, "logps/rejected": -908.0, "loss": 0.4054, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -3.515625, "rewards/margins": 3.75, "rewards/rejected": -7.28125, "step": 2610 }, { "epoch": 0.18130233201854543, "grad_norm": 25.851363222163567, "learning_rate": 4.900135197840137e-07, "logits/chosen": -2.71875, "logits/rejected": -2.734375, "logps/chosen": -580.0, "logps/rejected": -924.0, "loss": 0.3945, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -3.984375, "rewards/margins": 3.640625, "rewards/rejected": -7.625, "step": 2620 }, { "epoch": 0.181994325652204, "grad_norm": 20.65753906867768, "learning_rate": 4.89843834019538e-07, "logits/chosen": -2.609375, "logits/rejected": -2.609375, "logps/chosen": -576.0, "logps/rejected": -916.0, "loss": 0.3816, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.90625, "rewards/margins": 3.78125, "rewards/rejected": -7.6875, "step": 2630 }, { "epoch": 0.18268631928586257, "grad_norm": 32.1721997353203, "learning_rate": 4.896727486439592e-07, "logits/chosen": -2.796875, "logits/rejected": -2.796875, "logps/chosen": -520.0, "logps/rejected": -868.0, "loss": 0.3966, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.734375, "rewards/margins": 3.46875, "rewards/rejected": -7.21875, "step": 2640 }, { "epoch": 0.18337831291952114, "grad_norm": 14.42954863524147, "learning_rate": 4.895002646556477e-07, "logits/chosen": -2.640625, "logits/rejected": -2.65625, "logps/chosen": -510.0, "logps/rejected": -828.0, "loss": 0.4169, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.4375, "rewards/margins": 3.34375, "rewards/rejected": -6.78125, "step": 2650 }, { "epoch": 0.1840703065531797, "grad_norm": 20.72154554100892, "learning_rate": 4.893263830611354e-07, "logits/chosen": -2.59375, "logits/rejected": -2.75, "logps/chosen": -580.0, "logps/rejected": -880.0, "loss": 0.4016, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -3.84375, "rewards/margins": 3.1875, "rewards/rejected": -7.03125, "step": 2660 }, { "epoch": 0.18476230018683829, "grad_norm": 18.439887146764963, "learning_rate": 4.891511048751102e-07, "logits/chosen": -2.578125, "logits/rejected": -2.53125, "logps/chosen": -592.0, "logps/rejected": -952.0, "loss": 0.3874, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.125, "rewards/margins": 3.796875, "rewards/rejected": -7.9375, "step": 2670 }, { "epoch": 0.18545429382049686, "grad_norm": 23.571624977168113, "learning_rate": 4.889744311204098e-07, "logits/chosen": -2.703125, "logits/rejected": -2.71875, "logps/chosen": -552.0, "logps/rejected": -924.0, "loss": 0.4341, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -3.875, "rewards/margins": 3.78125, "rewards/rejected": -7.65625, "step": 2680 }, { "epoch": 0.18614628745415543, "grad_norm": 24.988537279184378, "learning_rate": 4.887963628280155e-07, "logits/chosen": -2.875, "logits/rejected": -2.921875, "logps/chosen": -572.0, "logps/rejected": -904.0, "loss": 0.4069, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.09375, "rewards/margins": 3.421875, "rewards/rejected": -7.53125, "step": 2690 }, { "epoch": 0.186838281087814, "grad_norm": 19.437841801814166, "learning_rate": 4.886169010370468e-07, "logits/chosen": -2.671875, "logits/rejected": -2.765625, "logps/chosen": -556.0, "logps/rejected": -848.0, "loss": 0.4269, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -3.828125, "rewards/margins": 3.03125, "rewards/rejected": -6.875, "step": 2700 }, { "epoch": 0.18753027472147257, "grad_norm": 22.449884056965416, "learning_rate": 4.884360467947546e-07, "logits/chosen": -2.625, "logits/rejected": -2.5625, "logps/chosen": -560.0, "logps/rejected": -916.0, "loss": 0.4377, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -3.8125, "rewards/margins": 3.5625, "rewards/rejected": -7.375, "step": 2710 }, { "epoch": 0.18822226835513114, "grad_norm": 25.49352695677722, "learning_rate": 4.882538011565158e-07, "logits/chosen": -2.859375, "logits/rejected": -2.796875, "logps/chosen": -524.0, "logps/rejected": -916.0, "loss": 0.404, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -3.78125, "rewards/margins": 3.625, "rewards/rejected": -7.40625, "step": 2720 }, { "epoch": 0.18891426198878972, "grad_norm": 24.030560296835553, "learning_rate": 4.880701651858266e-07, "logits/chosen": -2.765625, "logits/rejected": -2.765625, "logps/chosen": -588.0, "logps/rejected": -900.0, "loss": 0.4155, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.96875, "rewards/margins": 3.296875, "rewards/rejected": -7.28125, "step": 2730 }, { "epoch": 0.18960625562244826, "grad_norm": 27.160117549872705, "learning_rate": 4.878851399542964e-07, "logits/chosen": -2.734375, "logits/rejected": -2.875, "logps/chosen": -564.0, "logps/rejected": -892.0, "loss": 0.3713, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -3.875, "rewards/margins": 3.53125, "rewards/rejected": -7.40625, "step": 2740 }, { "epoch": 0.19029824925610683, "grad_norm": 20.81343136830502, "learning_rate": 4.87698726541642e-07, "logits/chosen": -2.796875, "logits/rejected": -2.703125, "logps/chosen": -584.0, "logps/rejected": -908.0, "loss": 0.4181, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -4.09375, "rewards/margins": 3.25, "rewards/rejected": -7.34375, "step": 2750 }, { "epoch": 0.1909902428897654, "grad_norm": 21.126639717724693, "learning_rate": 4.875109260356808e-07, "logits/chosen": -2.671875, "logits/rejected": -2.78125, "logps/chosen": -556.0, "logps/rejected": -868.0, "loss": 0.3708, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -3.84375, "rewards/margins": 3.25, "rewards/rejected": -7.09375, "step": 2760 }, { "epoch": 0.19168223652342398, "grad_norm": 21.142674877744284, "learning_rate": 4.873217395323243e-07, "logits/chosen": -2.65625, "logits/rejected": -2.78125, "logps/chosen": -564.0, "logps/rejected": -888.0, "loss": 0.3861, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.828125, "rewards/margins": 3.390625, "rewards/rejected": -7.21875, "step": 2770 }, { "epoch": 0.19237423015708255, "grad_norm": 21.095093459860916, "learning_rate": 4.871311681355724e-07, "logits/chosen": -2.640625, "logits/rejected": -2.734375, "logps/chosen": -524.0, "logps/rejected": -832.0, "loss": 0.3614, "rewards/accuracies": 0.9375, "rewards/chosen": -3.53125, "rewards/margins": 3.296875, "rewards/rejected": -6.84375, "step": 2780 }, { "epoch": 0.19306622379074112, "grad_norm": 26.99061141936349, "learning_rate": 4.869392129575064e-07, "logits/chosen": -2.734375, "logits/rejected": -2.59375, "logps/chosen": -536.0, "logps/rejected": -932.0, "loss": 0.3903, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -3.671875, "rewards/margins": 3.96875, "rewards/rejected": -7.65625, "step": 2790 }, { "epoch": 0.1937582174243997, "grad_norm": 21.02438079816018, "learning_rate": 4.867458751182825e-07, "logits/chosen": -2.671875, "logits/rejected": -2.828125, "logps/chosen": -604.0, "logps/rejected": -872.0, "loss": 0.4162, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -4.3125, "rewards/margins": 2.953125, "rewards/rejected": -7.25, "step": 2800 }, { "epoch": 0.19445021105805826, "grad_norm": 20.302491358565785, "learning_rate": 4.865511557461258e-07, "logits/chosen": -2.5, "logits/rejected": -2.4375, "logps/chosen": -604.0, "logps/rejected": -916.0, "loss": 0.3843, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.875, "rewards/margins": 3.375, "rewards/rejected": -7.25, "step": 2810 }, { "epoch": 0.19514220469171684, "grad_norm": 24.854033386482925, "learning_rate": 4.863550559773232e-07, "logits/chosen": -2.640625, "logits/rejected": -2.640625, "logps/chosen": -552.0, "logps/rejected": -892.0, "loss": 0.3912, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.78125, "rewards/margins": 3.421875, "rewards/rejected": -7.21875, "step": 2820 }, { "epoch": 0.1958341983253754, "grad_norm": 24.676935943778997, "learning_rate": 4.861575769562166e-07, "logits/chosen": -2.640625, "logits/rejected": -2.546875, "logps/chosen": -580.0, "logps/rejected": -932.0, "loss": 0.4585, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -4.15625, "rewards/margins": 3.5625, "rewards/rejected": -7.71875, "step": 2830 }, { "epoch": 0.19652619195903398, "grad_norm": 20.69403029271643, "learning_rate": 4.85958719835197e-07, "logits/chosen": -2.6875, "logits/rejected": -2.59375, "logps/chosen": -600.0, "logps/rejected": -916.0, "loss": 0.383, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.1875, "rewards/margins": 3.390625, "rewards/rejected": -7.5625, "step": 2840 }, { "epoch": 0.19721818559269255, "grad_norm": 15.441836100157094, "learning_rate": 4.857584857746971e-07, "logits/chosen": -2.75, "logits/rejected": -2.734375, "logps/chosen": -556.0, "logps/rejected": -872.0, "loss": 0.3977, "rewards/accuracies": 0.90625, "rewards/chosen": -3.859375, "rewards/margins": 3.21875, "rewards/rejected": -7.09375, "step": 2850 }, { "epoch": 0.19791017922635112, "grad_norm": 22.136158672529273, "learning_rate": 4.855568759431849e-07, "logits/chosen": -2.6875, "logits/rejected": -2.59375, "logps/chosen": -568.0, "logps/rejected": -940.0, "loss": 0.4079, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -3.984375, "rewards/margins": 3.453125, "rewards/rejected": -7.4375, "step": 2860 }, { "epoch": 0.1986021728600097, "grad_norm": 19.12982229470724, "learning_rate": 4.853538915171563e-07, "logits/chosen": -2.78125, "logits/rejected": -2.671875, "logps/chosen": -572.0, "logps/rejected": -956.0, "loss": 0.3406, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.03125, "rewards/margins": 3.671875, "rewards/rejected": -7.6875, "step": 2870 }, { "epoch": 0.19929416649366827, "grad_norm": 26.964796746147034, "learning_rate": 4.851495336811291e-07, "logits/chosen": -2.828125, "logits/rejected": -2.78125, "logps/chosen": -564.0, "logps/rejected": -904.0, "loss": 0.3772, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -3.703125, "rewards/margins": 3.6875, "rewards/rejected": -7.375, "step": 2880 }, { "epoch": 0.19998616012732684, "grad_norm": 20.978766469166096, "learning_rate": 4.849438036276356e-07, "logits/chosen": -2.734375, "logits/rejected": -2.84375, "logps/chosen": -588.0, "logps/rejected": -932.0, "loss": 0.38, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.09375, "rewards/margins": 3.5625, "rewards/rejected": -7.65625, "step": 2890 }, { "epoch": 0.2006781537609854, "grad_norm": 21.83388682620612, "learning_rate": 4.847367025572156e-07, "logits/chosen": -2.859375, "logits/rejected": -2.828125, "logps/chosen": -580.0, "logps/rejected": -904.0, "loss": 0.3795, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.0, "rewards/margins": 3.546875, "rewards/rejected": -7.5625, "step": 2900 }, { "epoch": 0.20137014739464396, "grad_norm": 19.846317038098505, "learning_rate": 4.845282316784093e-07, "logits/chosen": -2.828125, "logits/rejected": -2.796875, "logps/chosen": -596.0, "logps/rejected": -920.0, "loss": 0.4129, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.15625, "rewards/margins": 3.4375, "rewards/rejected": -7.59375, "step": 2910 }, { "epoch": 0.20206214102830253, "grad_norm": 19.47803723640568, "learning_rate": 4.843183922077507e-07, "logits/chosen": -2.921875, "logits/rejected": -3.03125, "logps/chosen": -544.0, "logps/rejected": -888.0, "loss": 0.3867, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.796875, "rewards/margins": 3.625, "rewards/rejected": -7.4375, "step": 2920 }, { "epoch": 0.2027541346619611, "grad_norm": 25.972072256761077, "learning_rate": 4.841071853697601e-07, "logits/chosen": -2.78125, "logits/rejected": -2.875, "logps/chosen": -564.0, "logps/rejected": -916.0, "loss": 0.4171, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.84375, "rewards/margins": 3.640625, "rewards/rejected": -7.5, "step": 2930 }, { "epoch": 0.20344612829561967, "grad_norm": 19.912459523493283, "learning_rate": 4.838946123969373e-07, "logits/chosen": -2.71875, "logits/rejected": -2.75, "logps/chosen": -564.0, "logps/rejected": -948.0, "loss": 0.3296, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -3.890625, "rewards/margins": 3.734375, "rewards/rejected": -7.625, "step": 2940 }, { "epoch": 0.20413812192927824, "grad_norm": 23.527260807626167, "learning_rate": 4.836806745297539e-07, "logits/chosen": -2.734375, "logits/rejected": -2.859375, "logps/chosen": -564.0, "logps/rejected": -908.0, "loss": 0.3755, "rewards/accuracies": 0.90625, "rewards/chosen": -3.78125, "rewards/margins": 3.609375, "rewards/rejected": -7.375, "step": 2950 }, { "epoch": 0.20483011556293682, "grad_norm": 35.90426892077208, "learning_rate": 4.834653730166464e-07, "logits/chosen": -2.78125, "logits/rejected": -2.75, "logps/chosen": -620.0, "logps/rejected": -932.0, "loss": 0.4044, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.375, "rewards/margins": 3.265625, "rewards/rejected": -7.625, "step": 2960 }, { "epoch": 0.2055221091965954, "grad_norm": 20.109482178235673, "learning_rate": 4.832487091140089e-07, "logits/chosen": -2.734375, "logits/rejected": -2.765625, "logps/chosen": -556.0, "logps/rejected": -888.0, "loss": 0.3764, "rewards/accuracies": 0.9375, "rewards/chosen": -3.875, "rewards/margins": 3.40625, "rewards/rejected": -7.25, "step": 2970 }, { "epoch": 0.20621410283025396, "grad_norm": 25.324377881603645, "learning_rate": 4.830306840861861e-07, "logits/chosen": -2.796875, "logits/rejected": -2.890625, "logps/chosen": -552.0, "logps/rejected": -836.0, "loss": 0.3684, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -3.671875, "rewards/margins": 3.09375, "rewards/rejected": -6.78125, "step": 2980 }, { "epoch": 0.20690609646391253, "grad_norm": 23.881690765608198, "learning_rate": 4.828112992054649e-07, "logits/chosen": -2.859375, "logits/rejected": -2.703125, "logps/chosen": -604.0, "logps/rejected": -940.0, "loss": 0.4259, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.25, "rewards/margins": 3.4375, "rewards/rejected": -7.65625, "step": 2990 }, { "epoch": 0.2075980900975711, "grad_norm": 21.51624612309427, "learning_rate": 4.82590555752068e-07, "logits/chosen": -2.84375, "logits/rejected": -2.890625, "logps/chosen": -568.0, "logps/rejected": -892.0, "loss": 0.3665, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -3.890625, "rewards/margins": 3.515625, "rewards/rejected": -7.40625, "step": 3000 }, { "epoch": 0.2075980900975711, "eval_logits/chosen": -2.828125, "eval_logits/rejected": -2.84375, "eval_logps/chosen": -620.0, "eval_logps/rejected": -924.0, "eval_loss": 0.24653171002864838, "eval_rewards/accuracies": 0.8862001299858093, "eval_rewards/chosen": -4.3125, "eval_rewards/margins": 3.28125, "eval_rewards/rejected": -7.59375, "eval_runtime": 2935.4881, "eval_samples_per_second": 33.328, "eval_steps_per_second": 0.521, "step": 3000 }, { "epoch": 0.20829008373122967, "grad_norm": 24.83781333495853, "learning_rate": 4.823684550141464e-07, "logits/chosen": -2.859375, "logits/rejected": -3.03125, "logps/chosen": -600.0, "logps/rejected": -900.0, "loss": 0.3964, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.375, "rewards/margins": 3.140625, "rewards/rejected": -7.5, "step": 3010 }, { "epoch": 0.20898207736488825, "grad_norm": 25.04898398325153, "learning_rate": 4.821449982877707e-07, "logits/chosen": -2.765625, "logits/rejected": -2.90625, "logps/chosen": -588.0, "logps/rejected": -888.0, "loss": 0.3997, "rewards/accuracies": 0.90625, "rewards/chosen": -3.96875, "rewards/margins": 3.140625, "rewards/rejected": -7.125, "step": 3020 }, { "epoch": 0.20967407099854682, "grad_norm": 21.814859232971166, "learning_rate": 4.819201868769252e-07, "logits/chosen": -2.671875, "logits/rejected": -2.765625, "logps/chosen": -612.0, "logps/rejected": -900.0, "loss": 0.3449, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.1875, "rewards/margins": 3.1875, "rewards/rejected": -7.375, "step": 3030 }, { "epoch": 0.2103660646322054, "grad_norm": 22.213279965764144, "learning_rate": 4.816940220934991e-07, "logits/chosen": -2.703125, "logits/rejected": -2.703125, "logps/chosen": -604.0, "logps/rejected": -948.0, "loss": 0.4025, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.1875, "rewards/margins": 3.65625, "rewards/rejected": -7.84375, "step": 3040 }, { "epoch": 0.21105805826586396, "grad_norm": 22.226988079752307, "learning_rate": 4.814665052572792e-07, "logits/chosen": -2.78125, "logits/rejected": -2.734375, "logps/chosen": -632.0, "logps/rejected": -952.0, "loss": 0.4227, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.5, "rewards/margins": 3.15625, "rewards/rejected": -7.65625, "step": 3050 }, { "epoch": 0.21175005189952253, "grad_norm": 17.225860889776754, "learning_rate": 4.812376376959423e-07, "logits/chosen": -2.859375, "logits/rejected": -2.921875, "logps/chosen": -576.0, "logps/rejected": -880.0, "loss": 0.3686, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.0625, "rewards/margins": 3.25, "rewards/rejected": -7.3125, "step": 3060 }, { "epoch": 0.2124420455331811, "grad_norm": 28.316567006429377, "learning_rate": 4.810074207450473e-07, "logits/chosen": -2.8125, "logits/rejected": -2.8125, "logps/chosen": -596.0, "logps/rejected": -972.0, "loss": 0.3634, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.09375, "rewards/margins": 3.78125, "rewards/rejected": -7.875, "step": 3070 }, { "epoch": 0.21313403916683968, "grad_norm": 20.484902319452953, "learning_rate": 4.807758557480276e-07, "logits/chosen": -2.765625, "logits/rejected": -2.78125, "logps/chosen": -584.0, "logps/rejected": -928.0, "loss": 0.3413, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.1875, "rewards/margins": 3.53125, "rewards/rejected": -7.71875, "step": 3080 }, { "epoch": 0.21382603280049822, "grad_norm": 27.094909171722783, "learning_rate": 4.80542944056183e-07, "logits/chosen": -2.78125, "logits/rejected": -2.78125, "logps/chosen": -604.0, "logps/rejected": -996.0, "loss": 0.382, "rewards/accuracies": 0.90625, "rewards/chosen": -4.375, "rewards/margins": 3.75, "rewards/rejected": -8.125, "step": 3090 }, { "epoch": 0.2145180264341568, "grad_norm": 29.83716534057815, "learning_rate": 4.803086870286721e-07, "logits/chosen": -2.75, "logits/rejected": -2.796875, "logps/chosen": -556.0, "logps/rejected": -924.0, "loss": 0.3753, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.75, "rewards/margins": 3.734375, "rewards/rejected": -7.5, "step": 3100 }, { "epoch": 0.21521002006781537, "grad_norm": 32.13780300114932, "learning_rate": 4.800730860325042e-07, "logits/chosen": -2.796875, "logits/rejected": -2.734375, "logps/chosen": -528.0, "logps/rejected": -932.0, "loss": 0.3322, "rewards/accuracies": 0.9375, "rewards/chosen": -3.59375, "rewards/margins": 3.90625, "rewards/rejected": -7.5, "step": 3110 }, { "epoch": 0.21590201370147394, "grad_norm": 29.20318225873319, "learning_rate": 4.798361424425312e-07, "logits/chosen": -2.796875, "logits/rejected": -2.6875, "logps/chosen": -648.0, "logps/rejected": -996.0, "loss": 0.3882, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.625, "rewards/margins": 3.671875, "rewards/rejected": -8.3125, "step": 3120 }, { "epoch": 0.2165940073351325, "grad_norm": 14.71387615574955, "learning_rate": 4.795978576414395e-07, "logits/chosen": -2.71875, "logits/rejected": -3.0, "logps/chosen": -568.0, "logps/rejected": -904.0, "loss": 0.3679, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.84375, "rewards/margins": 3.703125, "rewards/rejected": -7.53125, "step": 3130 }, { "epoch": 0.21728600096879108, "grad_norm": 19.75198709737791, "learning_rate": 4.79358233019743e-07, "logits/chosen": -2.875, "logits/rejected": -3.015625, "logps/chosen": -580.0, "logps/rejected": -948.0, "loss": 0.3373, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.125, "rewards/margins": 3.75, "rewards/rejected": -7.875, "step": 3140 }, { "epoch": 0.21797799460244965, "grad_norm": 23.963242274931837, "learning_rate": 4.791172699757733e-07, "logits/chosen": -2.890625, "logits/rejected": -2.875, "logps/chosen": -624.0, "logps/rejected": -1056.0, "loss": 0.3893, "rewards/accuracies": 0.90625, "rewards/chosen": -4.4375, "rewards/margins": 4.09375, "rewards/rejected": -8.5, "step": 3150 }, { "epoch": 0.21866998823610823, "grad_norm": 22.300802800376424, "learning_rate": 4.788749699156726e-07, "logits/chosen": -2.921875, "logits/rejected": -2.921875, "logps/chosen": -584.0, "logps/rejected": -928.0, "loss": 0.3967, "rewards/accuracies": 0.90625, "rewards/chosen": -4.0625, "rewards/margins": 3.59375, "rewards/rejected": -7.65625, "step": 3160 }, { "epoch": 0.2193619818697668, "grad_norm": 22.36106424389238, "learning_rate": 4.786313342533855e-07, "logits/chosen": -2.78125, "logits/rejected": -2.890625, "logps/chosen": -588.0, "logps/rejected": -952.0, "loss": 0.3853, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -3.96875, "rewards/margins": 3.796875, "rewards/rejected": -7.75, "step": 3170 }, { "epoch": 0.22005397550342537, "grad_norm": 21.91202023740179, "learning_rate": 4.783863644106502e-07, "logits/chosen": -2.90625, "logits/rejected": -3.078125, "logps/chosen": -572.0, "logps/rejected": -952.0, "loss": 0.3487, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.0625, "rewards/margins": 3.890625, "rewards/rejected": -7.9375, "step": 3180 }, { "epoch": 0.22074596913708394, "grad_norm": 27.545824972477288, "learning_rate": 4.781400618169908e-07, "logits/chosen": -2.859375, "logits/rejected": -2.9375, "logps/chosen": -576.0, "logps/rejected": -948.0, "loss": 0.3466, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.0625, "rewards/margins": 3.84375, "rewards/rejected": -7.90625, "step": 3190 }, { "epoch": 0.2214379627707425, "grad_norm": 18.77761468025167, "learning_rate": 4.778924279097085e-07, "logits/chosen": -2.90625, "logits/rejected": -2.90625, "logps/chosen": -596.0, "logps/rejected": -960.0, "loss": 0.3336, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.34375, "rewards/margins": 3.703125, "rewards/rejected": -8.0625, "step": 3200 }, { "epoch": 0.22212995640440109, "grad_norm": 21.06215390288792, "learning_rate": 4.776434641338735e-07, "logits/chosen": -2.71875, "logits/rejected": -2.78125, "logps/chosen": -576.0, "logps/rejected": -904.0, "loss": 0.3834, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.03125, "rewards/margins": 3.40625, "rewards/rejected": -7.4375, "step": 3210 }, { "epoch": 0.22282195003805966, "grad_norm": 23.861837421157265, "learning_rate": 4.773931719423164e-07, "logits/chosen": -2.8125, "logits/rejected": -2.859375, "logps/chosen": -608.0, "logps/rejected": -956.0, "loss": 0.3443, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.40625, "rewards/margins": 3.546875, "rewards/rejected": -7.9375, "step": 3220 }, { "epoch": 0.22351394367171823, "grad_norm": 21.16096718598134, "learning_rate": 4.771415527956198e-07, "logits/chosen": -2.765625, "logits/rejected": -2.875, "logps/chosen": -604.0, "logps/rejected": -956.0, "loss": 0.3586, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.3125, "rewards/margins": 3.703125, "rewards/rejected": -8.0, "step": 3230 }, { "epoch": 0.2242059373053768, "grad_norm": 40.68941455743844, "learning_rate": 4.768886081621096e-07, "logits/chosen": -2.8125, "logits/rejected": -2.8125, "logps/chosen": -580.0, "logps/rejected": -940.0, "loss": 0.3751, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.09375, "rewards/margins": 3.6875, "rewards/rejected": -7.78125, "step": 3240 }, { "epoch": 0.22489793093903537, "grad_norm": 18.04361520992324, "learning_rate": 4.7663433951784706e-07, "logits/chosen": -2.6875, "logits/rejected": -2.859375, "logps/chosen": -632.0, "logps/rejected": -900.0, "loss": 0.3999, "rewards/accuracies": 0.875, "rewards/chosen": -4.5625, "rewards/margins": 2.953125, "rewards/rejected": -7.53125, "step": 3250 }, { "epoch": 0.22558992457269395, "grad_norm": 20.716932909544767, "learning_rate": 4.763787483466191e-07, "logits/chosen": -2.703125, "logits/rejected": -2.78125, "logps/chosen": -560.0, "logps/rejected": -896.0, "loss": 0.3431, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -3.8125, "rewards/margins": 3.453125, "rewards/rejected": -7.28125, "step": 3260 }, { "epoch": 0.2262819182063525, "grad_norm": 18.181832281812582, "learning_rate": 4.761218361399308e-07, "logits/chosen": -2.796875, "logits/rejected": -2.78125, "logps/chosen": -568.0, "logps/rejected": -964.0, "loss": 0.3483, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.96875, "rewards/margins": 3.984375, "rewards/rejected": -7.9375, "step": 3270 }, { "epoch": 0.22697391184001106, "grad_norm": 18.78725558253795, "learning_rate": 4.758636043969958e-07, "logits/chosen": -2.84375, "logits/rejected": -3.0, "logps/chosen": -604.0, "logps/rejected": -956.0, "loss": 0.3813, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.21875, "rewards/margins": 3.640625, "rewards/rejected": -7.875, "step": 3280 }, { "epoch": 0.22766590547366963, "grad_norm": 22.91091622331056, "learning_rate": 4.756040546247281e-07, "logits/chosen": -2.859375, "logits/rejected": -2.953125, "logps/chosen": -580.0, "logps/rejected": -940.0, "loss": 0.3278, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.09375, "rewards/margins": 3.703125, "rewards/rejected": -7.78125, "step": 3290 }, { "epoch": 0.2283578991073282, "grad_norm": 23.70498367169381, "learning_rate": 4.753431883377329e-07, "logits/chosen": -2.5625, "logits/rejected": -2.734375, "logps/chosen": -580.0, "logps/rejected": -968.0, "loss": 0.3834, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.96875, "rewards/margins": 4.03125, "rewards/rejected": -8.0, "step": 3300 }, { "epoch": 0.22904989274098678, "grad_norm": 22.616880824961704, "learning_rate": 4.7508100705829814e-07, "logits/chosen": -2.875, "logits/rejected": -3.078125, "logps/chosen": -608.0, "logps/rejected": -940.0, "loss": 0.3436, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.46875, "rewards/margins": 3.484375, "rewards/rejected": -7.9375, "step": 3310 }, { "epoch": 0.22974188637464535, "grad_norm": 28.91932146350189, "learning_rate": 4.7481751231638527e-07, "logits/chosen": -2.84375, "logits/rejected": -2.765625, "logps/chosen": -608.0, "logps/rejected": -968.0, "loss": 0.3483, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.375, "rewards/margins": 3.765625, "rewards/rejected": -8.125, "step": 3320 }, { "epoch": 0.23043388000830392, "grad_norm": 22.538829330817066, "learning_rate": 4.745527056496206e-07, "logits/chosen": -2.828125, "logits/rejected": -2.921875, "logps/chosen": -588.0, "logps/rejected": -944.0, "loss": 0.3715, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.15625, "rewards/margins": 3.765625, "rewards/rejected": -7.9375, "step": 3330 }, { "epoch": 0.2311258736419625, "grad_norm": 26.46224160472678, "learning_rate": 4.7428658860328583e-07, "logits/chosen": -2.78125, "logits/rejected": -2.875, "logps/chosen": -604.0, "logps/rejected": -944.0, "loss": 0.363, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.3125, "rewards/margins": 3.5625, "rewards/rejected": -7.875, "step": 3340 }, { "epoch": 0.23181786727562106, "grad_norm": 19.74720395282223, "learning_rate": 4.7401916273030994e-07, "logits/chosen": -2.984375, "logits/rejected": -2.984375, "logps/chosen": -596.0, "logps/rejected": -944.0, "loss": 0.4271, "rewards/accuracies": 0.9375, "rewards/chosen": -4.125, "rewards/margins": 3.6875, "rewards/rejected": -7.8125, "step": 3350 }, { "epoch": 0.23250986090927964, "grad_norm": 21.680936207577545, "learning_rate": 4.737504295912592e-07, "logits/chosen": -2.671875, "logits/rejected": -2.75, "logps/chosen": -596.0, "logps/rejected": -928.0, "loss": 0.3647, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.125, "rewards/margins": 3.4375, "rewards/rejected": -7.5625, "step": 3360 }, { "epoch": 0.2332018545429382, "grad_norm": 28.549598455922332, "learning_rate": 4.7348039075432843e-07, "logits/chosen": -2.84375, "logits/rejected": -2.84375, "logps/chosen": -596.0, "logps/rejected": -968.0, "loss": 0.3695, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -4.34375, "rewards/margins": 3.71875, "rewards/rejected": -8.0625, "step": 3370 }, { "epoch": 0.23389384817659678, "grad_norm": 24.40273493316008, "learning_rate": 4.732090477953322e-07, "logits/chosen": -2.71875, "logits/rejected": -2.609375, "logps/chosen": -620.0, "logps/rejected": -992.0, "loss": 0.3748, "rewards/accuracies": 0.90625, "rewards/chosen": -4.34375, "rewards/margins": 3.84375, "rewards/rejected": -8.1875, "step": 3380 }, { "epoch": 0.23458584181025535, "grad_norm": 19.60584551548484, "learning_rate": 4.7293640229769494e-07, "logits/chosen": -2.734375, "logits/rejected": -2.90625, "logps/chosen": -616.0, "logps/rejected": -928.0, "loss": 0.3877, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.375, "rewards/margins": 3.421875, "rewards/rejected": -7.8125, "step": 3390 }, { "epoch": 0.23527783544391392, "grad_norm": 21.249512761737343, "learning_rate": 4.726624558524421e-07, "logits/chosen": -2.671875, "logits/rejected": -2.984375, "logps/chosen": -608.0, "logps/rejected": -904.0, "loss": 0.3691, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -4.21875, "rewards/margins": 3.296875, "rewards/rejected": -7.53125, "step": 3400 }, { "epoch": 0.2359698290775725, "grad_norm": 34.11057108585263, "learning_rate": 4.723872100581911e-07, "logits/chosen": -2.75, "logits/rejected": -2.84375, "logps/chosen": -612.0, "logps/rejected": -968.0, "loss": 0.3289, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.125, "rewards/margins": 3.828125, "rewards/rejected": -7.96875, "step": 3410 }, { "epoch": 0.23666182271123107, "grad_norm": 26.501091140987835, "learning_rate": 4.7211066652114146e-07, "logits/chosen": -2.8125, "logits/rejected": -2.8125, "logps/chosen": -580.0, "logps/rejected": -976.0, "loss": 0.3385, "rewards/accuracies": 0.9375, "rewards/chosen": -3.984375, "rewards/margins": 4.125, "rewards/rejected": -8.125, "step": 3420 }, { "epoch": 0.23735381634488964, "grad_norm": 26.515431607840945, "learning_rate": 4.718328268550658e-07, "logits/chosen": -2.84375, "logits/rejected": -2.890625, "logps/chosen": -580.0, "logps/rejected": -984.0, "loss": 0.3322, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.15625, "rewards/margins": 4.0625, "rewards/rejected": -8.25, "step": 3430 }, { "epoch": 0.23804580997854818, "grad_norm": 21.098507908958165, "learning_rate": 4.715536926813001e-07, "logits/chosen": -2.734375, "logits/rejected": -2.8125, "logps/chosen": -564.0, "logps/rejected": -996.0, "loss": 0.2977, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.734375, "rewards/margins": 4.5, "rewards/rejected": -8.25, "step": 3440 }, { "epoch": 0.23873780361220676, "grad_norm": 20.306141577227994, "learning_rate": 4.7127326562873487e-07, "logits/chosen": -2.84375, "logits/rejected": -2.796875, "logps/chosen": -612.0, "logps/rejected": -980.0, "loss": 0.3799, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.4375, "rewards/margins": 3.65625, "rewards/rejected": -8.0625, "step": 3450 }, { "epoch": 0.23942979724586533, "grad_norm": 21.31103319204866, "learning_rate": 4.709915473338049e-07, "logits/chosen": -2.796875, "logits/rejected": -2.78125, "logps/chosen": -604.0, "logps/rejected": -964.0, "loss": 0.3608, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.3125, "rewards/margins": 3.71875, "rewards/rejected": -8.0625, "step": 3460 }, { "epoch": 0.2401217908795239, "grad_norm": 19.59399097247858, "learning_rate": 4.7070853944048017e-07, "logits/chosen": -2.703125, "logits/rejected": -2.640625, "logps/chosen": -576.0, "logps/rejected": -972.0, "loss": 0.3199, "rewards/accuracies": 0.9375, "rewards/chosen": -3.890625, "rewards/margins": 3.921875, "rewards/rejected": -7.8125, "step": 3470 }, { "epoch": 0.24081378451318247, "grad_norm": 21.260990143483323, "learning_rate": 4.70424243600256e-07, "logits/chosen": -2.890625, "logits/rejected": -2.734375, "logps/chosen": -632.0, "logps/rejected": -1056.0, "loss": 0.3459, "rewards/accuracies": 0.9375, "rewards/chosen": -4.625, "rewards/margins": 4.21875, "rewards/rejected": -8.8125, "step": 3480 }, { "epoch": 0.24150577814684104, "grad_norm": 29.315243571277037, "learning_rate": 4.701386614721437e-07, "logits/chosen": -2.859375, "logits/rejected": -2.9375, "logps/chosen": -588.0, "logps/rejected": -1012.0, "loss": 0.3546, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.125, "rewards/margins": 4.28125, "rewards/rejected": -8.4375, "step": 3490 }, { "epoch": 0.24219777178049962, "grad_norm": 23.004914465646998, "learning_rate": 4.6985179472266054e-07, "logits/chosen": -2.71875, "logits/rejected": -2.953125, "logps/chosen": -572.0, "logps/rejected": -900.0, "loss": 0.3819, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -3.734375, "rewards/margins": 3.65625, "rewards/rejected": -7.40625, "step": 3500 }, { "epoch": 0.2428897654141582, "grad_norm": 22.257013314024864, "learning_rate": 4.6956364502582055e-07, "logits/chosen": -2.8125, "logits/rejected": -2.640625, "logps/chosen": -596.0, "logps/rejected": -1012.0, "loss": 0.3341, "rewards/accuracies": 0.9375, "rewards/chosen": -4.3125, "rewards/margins": 4.09375, "rewards/rejected": -8.4375, "step": 3510 }, { "epoch": 0.24358175904781676, "grad_norm": 18.27992865105092, "learning_rate": 4.6927421406312397e-07, "logits/chosen": -2.796875, "logits/rejected": -2.84375, "logps/chosen": -616.0, "logps/rejected": -980.0, "loss": 0.3466, "rewards/accuracies": 0.9375, "rewards/chosen": -4.3125, "rewards/margins": 3.921875, "rewards/rejected": -8.25, "step": 3520 }, { "epoch": 0.24427375268147533, "grad_norm": 25.481104143135887, "learning_rate": 4.689835035235481e-07, "logits/chosen": -2.859375, "logits/rejected": -2.96875, "logps/chosen": -596.0, "logps/rejected": -936.0, "loss": 0.3442, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.40625, "rewards/margins": 3.515625, "rewards/rejected": -7.9375, "step": 3530 }, { "epoch": 0.2449657463151339, "grad_norm": 24.77235233240306, "learning_rate": 4.6869151510353727e-07, "logits/chosen": -2.9375, "logits/rejected": -3.046875, "logps/chosen": -628.0, "logps/rejected": -1004.0, "loss": 0.3574, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.53125, "rewards/margins": 3.84375, "rewards/rejected": -8.375, "step": 3540 }, { "epoch": 0.24565773994879248, "grad_norm": 18.78558105430835, "learning_rate": 4.683982505069929e-07, "logits/chosen": -2.765625, "logits/rejected": -2.8125, "logps/chosen": -592.0, "logps/rejected": -936.0, "loss": 0.3223, "rewards/accuracies": 0.9375, "rewards/chosen": -4.375, "rewards/margins": 3.390625, "rewards/rejected": -7.75, "step": 3550 }, { "epoch": 0.24634973358245105, "grad_norm": 21.673874107725712, "learning_rate": 4.6810371144526333e-07, "logits/chosen": -2.765625, "logits/rejected": -2.875, "logps/chosen": -588.0, "logps/rejected": -920.0, "loss": 0.3429, "rewards/accuracies": 0.875, "rewards/chosen": -4.15625, "rewards/margins": 3.5, "rewards/rejected": -7.65625, "step": 3560 }, { "epoch": 0.24704172721610962, "grad_norm": 21.075978718318, "learning_rate": 4.678078996371343e-07, "logits/chosen": -2.71875, "logits/rejected": -2.796875, "logps/chosen": -628.0, "logps/rejected": -972.0, "loss": 0.348, "rewards/accuracies": 0.9375, "rewards/chosen": -4.4375, "rewards/margins": 3.703125, "rewards/rejected": -8.125, "step": 3570 }, { "epoch": 0.2477337208497682, "grad_norm": 23.522303755082827, "learning_rate": 4.6751081680881866e-07, "logits/chosen": -2.765625, "logits/rejected": -2.8125, "logps/chosen": -628.0, "logps/rejected": -940.0, "loss": 0.3746, "rewards/accuracies": 0.9375, "rewards/chosen": -4.375, "rewards/margins": 3.484375, "rewards/rejected": -7.84375, "step": 3580 }, { "epoch": 0.24842571448342676, "grad_norm": 19.317662197156057, "learning_rate": 4.672124646939462e-07, "logits/chosen": -2.734375, "logits/rejected": -2.84375, "logps/chosen": -612.0, "logps/rejected": -988.0, "loss": 0.3386, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.21875, "rewards/margins": 4.09375, "rewards/rejected": -8.3125, "step": 3590 }, { "epoch": 0.24911770811708533, "grad_norm": 26.271573944115186, "learning_rate": 4.6691284503355366e-07, "logits/chosen": -2.703125, "logits/rejected": -2.78125, "logps/chosen": -604.0, "logps/rejected": -972.0, "loss": 0.3065, "rewards/accuracies": 0.9375, "rewards/chosen": -4.28125, "rewards/margins": 3.765625, "rewards/rejected": -8.0625, "step": 3600 }, { "epoch": 0.2498097017507439, "grad_norm": 25.000107906631513, "learning_rate": 4.6661195957607445e-07, "logits/chosen": -2.859375, "logits/rejected": -2.796875, "logps/chosen": -564.0, "logps/rejected": -940.0, "loss": 0.3289, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -3.921875, "rewards/margins": 3.875, "rewards/rejected": -7.78125, "step": 3610 }, { "epoch": 0.2505016953844025, "grad_norm": 27.385280037138315, "learning_rate": 4.6630981007732897e-07, "logits/chosen": -2.765625, "logits/rejected": -2.78125, "logps/chosen": -588.0, "logps/rejected": -968.0, "loss": 0.313, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.03125, "rewards/margins": 3.796875, "rewards/rejected": -7.84375, "step": 3620 }, { "epoch": 0.25119368901806105, "grad_norm": 19.400683499080248, "learning_rate": 4.6600639830051335e-07, "logits/chosen": -2.953125, "logits/rejected": -3.046875, "logps/chosen": -584.0, "logps/rejected": -944.0, "loss": 0.3018, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.0, "rewards/margins": 3.75, "rewards/rejected": -7.75, "step": 3630 }, { "epoch": 0.2518856826517196, "grad_norm": 14.177796522489208, "learning_rate": 4.657017260161903e-07, "logits/chosen": -2.796875, "logits/rejected": -3.03125, "logps/chosen": -580.0, "logps/rejected": -980.0, "loss": 0.3521, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.03125, "rewards/margins": 4.125, "rewards/rejected": -8.125, "step": 3640 }, { "epoch": 0.2525776762853782, "grad_norm": 20.30516271245923, "learning_rate": 4.653957950022779e-07, "logits/chosen": -2.78125, "logits/rejected": -2.78125, "logps/chosen": -544.0, "logps/rejected": -968.0, "loss": 0.3373, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.796875, "rewards/margins": 4.28125, "rewards/rejected": -8.0625, "step": 3650 }, { "epoch": 0.25326966991903677, "grad_norm": 21.2151489582457, "learning_rate": 4.6508860704403965e-07, "logits/chosen": -2.859375, "logits/rejected": -2.890625, "logps/chosen": -572.0, "logps/rejected": -920.0, "loss": 0.3261, "rewards/accuracies": 0.90625, "rewards/chosen": -3.765625, "rewards/margins": 3.734375, "rewards/rejected": -7.5, "step": 3660 }, { "epoch": 0.25396166355269534, "grad_norm": 18.244242193165736, "learning_rate": 4.647801639340739e-07, "logits/chosen": -2.984375, "logits/rejected": -2.984375, "logps/chosen": -632.0, "logps/rejected": -1008.0, "loss": 0.3086, "rewards/accuracies": 0.90625, "rewards/chosen": -4.34375, "rewards/margins": 3.859375, "rewards/rejected": -8.25, "step": 3670 }, { "epoch": 0.2546536571863539, "grad_norm": 12.663785212985172, "learning_rate": 4.644704674723037e-07, "logits/chosen": -2.875, "logits/rejected": -2.953125, "logps/chosen": -640.0, "logps/rejected": -972.0, "loss": 0.3389, "rewards/accuracies": 0.90625, "rewards/chosen": -4.625, "rewards/margins": 3.5, "rewards/rejected": -8.125, "step": 3680 }, { "epoch": 0.2553456508200125, "grad_norm": 19.4935276364047, "learning_rate": 4.6415951946596563e-07, "logits/chosen": -2.859375, "logits/rejected": -2.90625, "logps/chosen": -552.0, "logps/rejected": -928.0, "loss": 0.3274, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -3.953125, "rewards/margins": 3.78125, "rewards/rejected": -7.71875, "step": 3690 }, { "epoch": 0.25603764445367105, "grad_norm": 19.927136477519664, "learning_rate": 4.638473217296002e-07, "logits/chosen": -2.875, "logits/rejected": -3.015625, "logps/chosen": -600.0, "logps/rejected": -984.0, "loss": 0.3001, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.125, "rewards/margins": 3.953125, "rewards/rejected": -8.0625, "step": 3700 }, { "epoch": 0.25672963808732957, "grad_norm": 23.008946745074788, "learning_rate": 4.6353387608504015e-07, "logits/chosen": -2.9375, "logits/rejected": -2.90625, "logps/chosen": -572.0, "logps/rejected": -952.0, "loss": 0.3443, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.03125, "rewards/margins": 3.625, "rewards/rejected": -7.65625, "step": 3710 }, { "epoch": 0.25742163172098814, "grad_norm": 26.510222543982376, "learning_rate": 4.632191843614008e-07, "logits/chosen": -2.90625, "logits/rejected": -3.03125, "logps/chosen": -568.0, "logps/rejected": -936.0, "loss": 0.346, "rewards/accuracies": 0.9375, "rewards/chosen": -3.96875, "rewards/margins": 3.828125, "rewards/rejected": -7.8125, "step": 3720 }, { "epoch": 0.2581136253546467, "grad_norm": 26.995657975364196, "learning_rate": 4.629032483950689e-07, "logits/chosen": -2.890625, "logits/rejected": -2.890625, "logps/chosen": -596.0, "logps/rejected": -948.0, "loss": 0.3331, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.03125, "rewards/margins": 3.84375, "rewards/rejected": -7.875, "step": 3730 }, { "epoch": 0.2588056189883053, "grad_norm": 27.66837785913489, "learning_rate": 4.6258607002969184e-07, "logits/chosen": -2.8125, "logits/rejected": -2.828125, "logps/chosen": -584.0, "logps/rejected": -964.0, "loss": 0.329, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.09375, "rewards/margins": 3.734375, "rewards/rejected": -7.8125, "step": 3740 }, { "epoch": 0.25949761262196386, "grad_norm": 27.98780316399931, "learning_rate": 4.6226765111616736e-07, "logits/chosen": -2.984375, "logits/rejected": -3.1875, "logps/chosen": -592.0, "logps/rejected": -944.0, "loss": 0.3393, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.15625, "rewards/margins": 3.71875, "rewards/rejected": -7.875, "step": 3750 }, { "epoch": 0.26018960625562243, "grad_norm": 24.273106700324966, "learning_rate": 4.6194799351263207e-07, "logits/chosen": -2.890625, "logits/rejected": -2.953125, "logps/chosen": -648.0, "logps/rejected": -992.0, "loss": 0.3201, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.59375, "rewards/margins": 3.703125, "rewards/rejected": -8.25, "step": 3760 }, { "epoch": 0.260881599889281, "grad_norm": 20.782874115371815, "learning_rate": 4.616270990844512e-07, "logits/chosen": -2.890625, "logits/rejected": -2.984375, "logps/chosen": -624.0, "logps/rejected": -1012.0, "loss": 0.3599, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.46875, "rewards/margins": 3.96875, "rewards/rejected": -8.4375, "step": 3770 }, { "epoch": 0.2615735935229396, "grad_norm": 18.72395684893371, "learning_rate": 4.6130496970420737e-07, "logits/chosen": -2.75, "logits/rejected": -2.90625, "logps/chosen": -624.0, "logps/rejected": -944.0, "loss": 0.3421, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.46875, "rewards/margins": 3.5, "rewards/rejected": -7.96875, "step": 3780 }, { "epoch": 0.26226558715659815, "grad_norm": 17.17403952737608, "learning_rate": 4.609816072516898e-07, "logits/chosen": -2.921875, "logits/rejected": -3.0, "logps/chosen": -624.0, "logps/rejected": -996.0, "loss": 0.3293, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.5, "rewards/margins": 3.84375, "rewards/rejected": -8.3125, "step": 3790 }, { "epoch": 0.2629575807902567, "grad_norm": 23.210010132789414, "learning_rate": 4.6065701361388333e-07, "logits/chosen": -2.859375, "logits/rejected": -2.84375, "logps/chosen": -624.0, "logps/rejected": -984.0, "loss": 0.3372, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.4375, "rewards/margins": 3.6875, "rewards/rejected": -8.125, "step": 3800 }, { "epoch": 0.2636495744239153, "grad_norm": 21.268760334639254, "learning_rate": 4.6033119068495745e-07, "logits/chosen": -2.796875, "logits/rejected": -2.84375, "logps/chosen": -588.0, "logps/rejected": -1004.0, "loss": 0.3537, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.125, "rewards/margins": 4.3125, "rewards/rejected": -8.4375, "step": 3810 }, { "epoch": 0.26434156805757386, "grad_norm": 27.32512207925837, "learning_rate": 4.600041403662551e-07, "logits/chosen": -2.90625, "logits/rejected": -3.0625, "logps/chosen": -648.0, "logps/rejected": -972.0, "loss": 0.3712, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -4.625, "rewards/margins": 3.5, "rewards/rejected": -8.125, "step": 3820 }, { "epoch": 0.26503356169123243, "grad_norm": 16.928871583404508, "learning_rate": 4.5967586456628174e-07, "logits/chosen": -2.796875, "logits/rejected": -2.828125, "logps/chosen": -612.0, "logps/rejected": -1000.0, "loss": 0.3112, "rewards/accuracies": 0.9375, "rewards/chosen": -4.34375, "rewards/margins": 3.90625, "rewards/rejected": -8.25, "step": 3830 }, { "epoch": 0.265725555324891, "grad_norm": 17.77322843698063, "learning_rate": 4.59346365200694e-07, "logits/chosen": -2.953125, "logits/rejected": -3.09375, "logps/chosen": -600.0, "logps/rejected": -952.0, "loss": 0.3271, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.21875, "rewards/margins": 3.75, "rewards/rejected": -7.96875, "step": 3840 }, { "epoch": 0.2664175489585496, "grad_norm": 24.080718817132457, "learning_rate": 4.590156441922889e-07, "logits/chosen": -2.9375, "logits/rejected": -3.015625, "logps/chosen": -644.0, "logps/rejected": -1056.0, "loss": 0.3529, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.625, "rewards/margins": 4.375, "rewards/rejected": -9.0, "step": 3850 }, { "epoch": 0.26710954259220815, "grad_norm": 31.6507465250174, "learning_rate": 4.5868370347099205e-07, "logits/chosen": -2.890625, "logits/rejected": -2.9375, "logps/chosen": -600.0, "logps/rejected": -1040.0, "loss": 0.3653, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.125, "rewards/margins": 4.375, "rewards/rejected": -8.5, "step": 3860 }, { "epoch": 0.2678015362258667, "grad_norm": 15.670870608200895, "learning_rate": 4.583505449738469e-07, "logits/chosen": -2.9375, "logits/rejected": -3.078125, "logps/chosen": -616.0, "logps/rejected": -1024.0, "loss": 0.3198, "rewards/accuracies": 0.9375, "rewards/chosen": -4.40625, "rewards/margins": 4.1875, "rewards/rejected": -8.5625, "step": 3870 }, { "epoch": 0.2684935298595253, "grad_norm": 18.839791765633844, "learning_rate": 4.5801617064500325e-07, "logits/chosen": -2.828125, "logits/rejected": -2.921875, "logps/chosen": -640.0, "logps/rejected": -1020.0, "loss": 0.3469, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.59375, "rewards/margins": 3.78125, "rewards/rejected": -8.375, "step": 3880 }, { "epoch": 0.26918552349318386, "grad_norm": 26.693065349907965, "learning_rate": 4.5768058243570586e-07, "logits/chosen": -2.8125, "logits/rejected": -2.96875, "logps/chosen": -636.0, "logps/rejected": -1020.0, "loss": 0.3579, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.46875, "rewards/margins": 4.125, "rewards/rejected": -8.5625, "step": 3890 }, { "epoch": 0.26987751712684244, "grad_norm": 23.998142715976797, "learning_rate": 4.5734378230428303e-07, "logits/chosen": -2.921875, "logits/rejected": -2.890625, "logps/chosen": -596.0, "logps/rejected": -984.0, "loss": 0.3488, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.25, "rewards/margins": 3.875, "rewards/rejected": -8.125, "step": 3900 }, { "epoch": 0.270569510760501, "grad_norm": 17.521449488239234, "learning_rate": 4.570057722161354e-07, "logits/chosen": -2.765625, "logits/rejected": -2.78125, "logps/chosen": -588.0, "logps/rejected": -980.0, "loss": 0.3382, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.125, "rewards/margins": 4.0, "rewards/rejected": -8.125, "step": 3910 }, { "epoch": 0.2712615043941596, "grad_norm": 20.567313321608243, "learning_rate": 4.566665541437242e-07, "logits/chosen": -2.8125, "logits/rejected": -2.875, "logps/chosen": -584.0, "logps/rejected": -984.0, "loss": 0.3448, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.0625, "rewards/margins": 4.21875, "rewards/rejected": -8.3125, "step": 3920 }, { "epoch": 0.27195349802781815, "grad_norm": 16.78460084147788, "learning_rate": 4.5632613006655985e-07, "logits/chosen": -2.890625, "logits/rejected": -3.015625, "logps/chosen": -628.0, "logps/rejected": -1016.0, "loss": 0.329, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.375, "rewards/margins": 4.0, "rewards/rejected": -8.375, "step": 3930 }, { "epoch": 0.2726454916614767, "grad_norm": 22.66718959177436, "learning_rate": 4.559845019711905e-07, "logits/chosen": -2.921875, "logits/rejected": -2.921875, "logps/chosen": -532.0, "logps/rejected": -964.0, "loss": 0.3152, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.6875, "rewards/margins": 4.28125, "rewards/rejected": -8.0, "step": 3940 }, { "epoch": 0.2733374852951353, "grad_norm": 22.760214217917625, "learning_rate": 4.556416718511904e-07, "logits/chosen": -2.828125, "logits/rejected": -2.96875, "logps/chosen": -568.0, "logps/rejected": -956.0, "loss": 0.3175, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.921875, "rewards/margins": 3.9375, "rewards/rejected": -7.875, "step": 3950 }, { "epoch": 0.27402947892879387, "grad_norm": 27.00039242897399, "learning_rate": 4.552976417071481e-07, "logits/chosen": -2.84375, "logits/rejected": -2.9375, "logps/chosen": -584.0, "logps/rejected": -988.0, "loss": 0.3017, "rewards/accuracies": 0.9375, "rewards/chosen": -4.0, "rewards/margins": 4.09375, "rewards/rejected": -8.125, "step": 3960 }, { "epoch": 0.27472147256245244, "grad_norm": 21.25327435102768, "learning_rate": 4.5495241354665495e-07, "logits/chosen": -2.984375, "logits/rejected": -3.03125, "logps/chosen": -576.0, "logps/rejected": -948.0, "loss": 0.3067, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.09375, "rewards/margins": 3.875, "rewards/rejected": -7.96875, "step": 3970 }, { "epoch": 0.275413466196111, "grad_norm": 20.042028866461624, "learning_rate": 4.546059893842934e-07, "logits/chosen": -3.0, "logits/rejected": -3.0625, "logps/chosen": -576.0, "logps/rejected": -952.0, "loss": 0.3406, "rewards/accuracies": 0.9375, "rewards/chosen": -4.09375, "rewards/margins": 3.8125, "rewards/rejected": -7.90625, "step": 3980 }, { "epoch": 0.2761054598297696, "grad_norm": 23.23528709240623, "learning_rate": 4.54258371241625e-07, "logits/chosen": -2.9375, "logits/rejected": -3.0, "logps/chosen": -592.0, "logps/rejected": -1020.0, "loss": 0.3464, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.28125, "rewards/margins": 4.28125, "rewards/rejected": -8.5625, "step": 3990 }, { "epoch": 0.27679745346342816, "grad_norm": 17.01072554651658, "learning_rate": 4.5390956114717903e-07, "logits/chosen": -2.84375, "logits/rejected": -2.90625, "logps/chosen": -580.0, "logps/rejected": -988.0, "loss": 0.3082, "rewards/accuracies": 0.9375, "rewards/chosen": -4.21875, "rewards/margins": 4.125, "rewards/rejected": -8.3125, "step": 4000 }, { "epoch": 0.27679745346342816, "eval_logits/chosen": -2.859375, "eval_logits/rejected": -2.96875, "eval_logps/chosen": -624.0, "eval_logps/rejected": -968.0, "eval_loss": 0.24186302721500397, "eval_rewards/accuracies": 0.8920863270759583, "eval_rewards/chosen": -4.34375, "eval_rewards/margins": 3.671875, "eval_rewards/rejected": -8.0625, "eval_runtime": 2937.4933, "eval_samples_per_second": 33.305, "eval_steps_per_second": 0.521, "step": 4000 }, { "epoch": 0.2774894470970867, "grad_norm": 24.192949376472992, "learning_rate": 4.535595611364403e-07, "logits/chosen": -2.78125, "logits/rejected": -3.015625, "logps/chosen": -624.0, "logps/rejected": -948.0, "loss": 0.3043, "rewards/accuracies": 0.90625, "rewards/chosen": -4.28125, "rewards/margins": 3.625, "rewards/rejected": -7.875, "step": 4010 }, { "epoch": 0.2781814407307453, "grad_norm": 23.97524859374442, "learning_rate": 4.5320837325183745e-07, "logits/chosen": -2.859375, "logits/rejected": -2.859375, "logps/chosen": -604.0, "logps/rejected": -1020.0, "loss": 0.3035, "rewards/accuracies": 0.90625, "rewards/chosen": -4.21875, "rewards/margins": 4.28125, "rewards/rejected": -8.5, "step": 4020 }, { "epoch": 0.27887343436440387, "grad_norm": 31.520060656593152, "learning_rate": 4.528559995427308e-07, "logits/chosen": -2.75, "logits/rejected": -2.890625, "logps/chosen": -604.0, "logps/rejected": -1032.0, "loss": 0.3192, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.25, "rewards/margins": 4.375, "rewards/rejected": -8.625, "step": 4030 }, { "epoch": 0.27956542799806244, "grad_norm": 18.155067092348354, "learning_rate": 4.5250244206540066e-07, "logits/chosen": -2.84375, "logits/rejected": -2.828125, "logps/chosen": -600.0, "logps/rejected": -952.0, "loss": 0.3323, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.0625, "rewards/margins": 3.734375, "rewards/rejected": -7.8125, "step": 4040 }, { "epoch": 0.280257421631721, "grad_norm": 24.80173063875789, "learning_rate": 4.521477028830353e-07, "logits/chosen": -2.71875, "logits/rejected": -3.0, "logps/chosen": -552.0, "logps/rejected": -924.0, "loss": 0.3152, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -3.90625, "rewards/margins": 3.875, "rewards/rejected": -7.78125, "step": 4050 }, { "epoch": 0.28094941526537953, "grad_norm": 28.36263172789394, "learning_rate": 4.5179178406571876e-07, "logits/chosen": -2.875, "logits/rejected": -2.921875, "logps/chosen": -560.0, "logps/rejected": -964.0, "loss": 0.3777, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -3.921875, "rewards/margins": 4.0625, "rewards/rejected": -7.96875, "step": 4060 }, { "epoch": 0.2816414088990381, "grad_norm": 19.313411524302264, "learning_rate": 4.514346876904189e-07, "logits/chosen": -2.765625, "logits/rejected": -2.828125, "logps/chosen": -616.0, "logps/rejected": -980.0, "loss": 0.31, "rewards/accuracies": 0.9375, "rewards/chosen": -4.3125, "rewards/margins": 3.828125, "rewards/rejected": -8.125, "step": 4070 }, { "epoch": 0.2823334025326967, "grad_norm": 18.337032140536987, "learning_rate": 4.5107641584097513e-07, "logits/chosen": -2.9375, "logits/rejected": -3.046875, "logps/chosen": -636.0, "logps/rejected": -972.0, "loss": 0.3393, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.59375, "rewards/margins": 3.609375, "rewards/rejected": -8.1875, "step": 4080 }, { "epoch": 0.28302539616635525, "grad_norm": 20.24357250319502, "learning_rate": 4.5071697060808656e-07, "logits/chosen": -2.796875, "logits/rejected": -2.734375, "logps/chosen": -580.0, "logps/rejected": -996.0, "loss": 0.2612, "rewards/accuracies": 0.9375, "rewards/chosen": -4.0625, "rewards/margins": 4.0625, "rewards/rejected": -8.125, "step": 4090 }, { "epoch": 0.2837173898000138, "grad_norm": 28.452022142224127, "learning_rate": 4.5035635408929927e-07, "logits/chosen": -2.78125, "logits/rejected": -2.96875, "logps/chosen": -592.0, "logps/rejected": -944.0, "loss": 0.3654, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.125, "rewards/margins": 3.71875, "rewards/rejected": -7.84375, "step": 4100 }, { "epoch": 0.2844093834336724, "grad_norm": 21.38772076640533, "learning_rate": 4.4999456838899477e-07, "logits/chosen": -2.828125, "logits/rejected": -2.921875, "logps/chosen": -600.0, "logps/rejected": -972.0, "loss": 0.3497, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.25, "rewards/margins": 3.734375, "rewards/rejected": -8.0, "step": 4110 }, { "epoch": 0.28510137706733096, "grad_norm": 19.450333388289756, "learning_rate": 4.4963161561837706e-07, "logits/chosen": -2.796875, "logits/rejected": -2.921875, "logps/chosen": -604.0, "logps/rejected": -976.0, "loss": 0.279, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.1875, "rewards/margins": 3.921875, "rewards/rejected": -8.125, "step": 4120 }, { "epoch": 0.28579337070098954, "grad_norm": 22.837446063966862, "learning_rate": 4.492674978954607e-07, "logits/chosen": -2.796875, "logits/rejected": -2.78125, "logps/chosen": -580.0, "logps/rejected": -972.0, "loss": 0.3363, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -3.9375, "rewards/margins": 4.15625, "rewards/rejected": -8.0625, "step": 4130 }, { "epoch": 0.2864853643346481, "grad_norm": 23.074291091878013, "learning_rate": 4.489022173450583e-07, "logits/chosen": -2.8125, "logits/rejected": -3.0, "logps/chosen": -568.0, "logps/rejected": -972.0, "loss": 0.2749, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.0, "rewards/margins": 4.15625, "rewards/rejected": -8.1875, "step": 4140 }, { "epoch": 0.2871773579683067, "grad_norm": 25.350519172970056, "learning_rate": 4.485357760987681e-07, "logits/chosen": -2.890625, "logits/rejected": -2.90625, "logps/chosen": -632.0, "logps/rejected": -1072.0, "loss": 0.3149, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.28125, "rewards/margins": 4.40625, "rewards/rejected": -8.6875, "step": 4150 }, { "epoch": 0.28786935160196525, "grad_norm": 23.35542506327524, "learning_rate": 4.4816817629496183e-07, "logits/chosen": -2.84375, "logits/rejected": -3.203125, "logps/chosen": -608.0, "logps/rejected": -972.0, "loss": 0.3296, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.4375, "rewards/margins": 3.84375, "rewards/rejected": -8.3125, "step": 4160 }, { "epoch": 0.2885613452356238, "grad_norm": 21.453665683621974, "learning_rate": 4.4779942007877183e-07, "logits/chosen": -2.828125, "logits/rejected": -2.96875, "logps/chosen": -584.0, "logps/rejected": -976.0, "loss": 0.3328, "rewards/accuracies": 0.9375, "rewards/chosen": -4.15625, "rewards/margins": 3.84375, "rewards/rejected": -8.0, "step": 4170 }, { "epoch": 0.2892533388692824, "grad_norm": 20.2047871385496, "learning_rate": 4.474295096020785e-07, "logits/chosen": -2.84375, "logits/rejected": -3.140625, "logps/chosen": -564.0, "logps/rejected": -948.0, "loss": 0.3251, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.859375, "rewards/margins": 4.125, "rewards/rejected": -8.0, "step": 4180 }, { "epoch": 0.28994533250294097, "grad_norm": 26.286272444741073, "learning_rate": 4.470584470234984e-07, "logits/chosen": -2.96875, "logits/rejected": -3.03125, "logps/chosen": -584.0, "logps/rejected": -976.0, "loss": 0.3525, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.21875, "rewards/margins": 3.984375, "rewards/rejected": -8.1875, "step": 4190 }, { "epoch": 0.29063732613659954, "grad_norm": 24.366602606083344, "learning_rate": 4.466862345083708e-07, "logits/chosen": -2.78125, "logits/rejected": -3.0, "logps/chosen": -608.0, "logps/rejected": -1024.0, "loss": 0.2797, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.25, "rewards/margins": 4.375, "rewards/rejected": -8.625, "step": 4200 }, { "epoch": 0.2913293197702581, "grad_norm": 23.019530435901256, "learning_rate": 4.4631287422874556e-07, "logits/chosen": -2.765625, "logits/rejected": -2.9375, "logps/chosen": -592.0, "logps/rejected": -964.0, "loss": 0.3286, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.03125, "rewards/margins": 4.125, "rewards/rejected": -8.1875, "step": 4210 }, { "epoch": 0.2920213134039167, "grad_norm": 19.076486719495865, "learning_rate": 4.4593836836337045e-07, "logits/chosen": -2.84375, "logits/rejected": -2.90625, "logps/chosen": -580.0, "logps/rejected": -1032.0, "loss": 0.265, "rewards/accuracies": 0.96875, "rewards/chosen": -3.953125, "rewards/margins": 4.78125, "rewards/rejected": -8.75, "step": 4220 }, { "epoch": 0.29271330703757525, "grad_norm": 16.894973787538714, "learning_rate": 4.455627190976781e-07, "logits/chosen": -2.875, "logits/rejected": -3.109375, "logps/chosen": -604.0, "logps/rejected": -1040.0, "loss": 0.2719, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.28125, "rewards/margins": 4.46875, "rewards/rejected": -8.75, "step": 4230 }, { "epoch": 0.2934053006712338, "grad_norm": 29.782551228759896, "learning_rate": 4.4518592862377357e-07, "logits/chosen": -2.921875, "logits/rejected": -2.953125, "logps/chosen": -612.0, "logps/rejected": -1032.0, "loss": 0.3104, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.3125, "rewards/margins": 4.34375, "rewards/rejected": -8.625, "step": 4240 }, { "epoch": 0.2940972943048924, "grad_norm": 19.9654019860814, "learning_rate": 4.4480799914042156e-07, "logits/chosen": -2.875, "logits/rejected": -2.78125, "logps/chosen": -600.0, "logps/rejected": -1040.0, "loss": 0.3486, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.25, "rewards/margins": 4.5, "rewards/rejected": -8.75, "step": 4250 }, { "epoch": 0.29478928793855097, "grad_norm": 24.455527809706272, "learning_rate": 4.444289328530333e-07, "logits/chosen": -2.875, "logits/rejected": -2.984375, "logps/chosen": -576.0, "logps/rejected": -952.0, "loss": 0.3086, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.03125, "rewards/margins": 3.796875, "rewards/rejected": -7.84375, "step": 4260 }, { "epoch": 0.29548128157220954, "grad_norm": 16.995365647097422, "learning_rate": 4.4404873197365396e-07, "logits/chosen": -2.734375, "logits/rejected": -2.890625, "logps/chosen": -600.0, "logps/rejected": -1032.0, "loss": 0.3168, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.125, "rewards/margins": 4.40625, "rewards/rejected": -8.5625, "step": 4270 }, { "epoch": 0.2961732752058681, "grad_norm": 31.060320392050073, "learning_rate": 4.436673987209495e-07, "logits/chosen": -2.9375, "logits/rejected": -3.125, "logps/chosen": -608.0, "logps/rejected": -1032.0, "loss": 0.3269, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.28125, "rewards/margins": 4.34375, "rewards/rejected": -8.625, "step": 4280 }, { "epoch": 0.2968652688395267, "grad_norm": 14.217049992167542, "learning_rate": 4.43284935320194e-07, "logits/chosen": -2.875, "logits/rejected": -3.1875, "logps/chosen": -600.0, "logps/rejected": -952.0, "loss": 0.3184, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.3125, "rewards/margins": 3.671875, "rewards/rejected": -7.96875, "step": 4290 }, { "epoch": 0.29755726247318526, "grad_norm": 17.488600167454734, "learning_rate": 4.429013440032565e-07, "logits/chosen": -2.890625, "logits/rejected": -2.84375, "logps/chosen": -616.0, "logps/rejected": -1040.0, "loss": 0.3501, "rewards/accuracies": 0.90625, "rewards/chosen": -4.40625, "rewards/margins": 4.28125, "rewards/rejected": -8.6875, "step": 4300 }, { "epoch": 0.29824925610684383, "grad_norm": 12.27921157254326, "learning_rate": 4.425166270085879e-07, "logits/chosen": -2.75, "logits/rejected": -2.90625, "logps/chosen": -552.0, "logps/rejected": -968.0, "loss": 0.2874, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -3.8125, "rewards/margins": 4.25, "rewards/rejected": -8.0625, "step": 4310 }, { "epoch": 0.2989412497405024, "grad_norm": 24.865791238528022, "learning_rate": 4.421307865812083e-07, "logits/chosen": -2.8125, "logits/rejected": -2.796875, "logps/chosen": -616.0, "logps/rejected": -1000.0, "loss": 0.3218, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.4375, "rewards/margins": 3.859375, "rewards/rejected": -8.3125, "step": 4320 }, { "epoch": 0.299633243374161, "grad_norm": 15.334663075912356, "learning_rate": 4.4174382497269313e-07, "logits/chosen": -2.859375, "logits/rejected": -2.96875, "logps/chosen": -604.0, "logps/rejected": -988.0, "loss": 0.3347, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.28125, "rewards/margins": 4.0, "rewards/rejected": -8.3125, "step": 4330 }, { "epoch": 0.30032523700781955, "grad_norm": 21.451630085432264, "learning_rate": 4.4135574444116085e-07, "logits/chosen": -2.84375, "logits/rejected": -2.859375, "logps/chosen": -604.0, "logps/rejected": -956.0, "loss": 0.3052, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.28125, "rewards/margins": 3.71875, "rewards/rejected": -8.0, "step": 4340 }, { "epoch": 0.3010172306414781, "grad_norm": 22.30629016577841, "learning_rate": 4.409665472512594e-07, "logits/chosen": -2.78125, "logits/rejected": -2.828125, "logps/chosen": -576.0, "logps/rejected": -960.0, "loss": 0.2955, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.0, "rewards/margins": 4.03125, "rewards/rejected": -8.0625, "step": 4350 }, { "epoch": 0.3017092242751367, "grad_norm": 19.682100811891036, "learning_rate": 4.405762356741528e-07, "logits/chosen": -2.84375, "logits/rejected": -2.625, "logps/chosen": -572.0, "logps/rejected": -1104.0, "loss": 0.3325, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.984375, "rewards/margins": 5.03125, "rewards/rejected": -9.0, "step": 4360 }, { "epoch": 0.30240121790879526, "grad_norm": 15.199092180143406, "learning_rate": 4.4018481198750806e-07, "logits/chosen": -2.765625, "logits/rejected": -3.015625, "logps/chosen": -600.0, "logps/rejected": -952.0, "loss": 0.2989, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.15625, "rewards/margins": 3.734375, "rewards/rejected": -7.875, "step": 4370 }, { "epoch": 0.30309321154245383, "grad_norm": 25.14091072004203, "learning_rate": 4.3979227847548226e-07, "logits/chosen": -2.84375, "logits/rejected": -3.0625, "logps/chosen": -612.0, "logps/rejected": -1000.0, "loss": 0.2984, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.25, "rewards/margins": 4.1875, "rewards/rejected": -8.4375, "step": 4380 }, { "epoch": 0.3037852051761124, "grad_norm": 20.216591912274218, "learning_rate": 4.3939863742870853e-07, "logits/chosen": -2.875, "logits/rejected": -2.984375, "logps/chosen": -600.0, "logps/rejected": -944.0, "loss": 0.3359, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.3125, "rewards/margins": 3.59375, "rewards/rejected": -7.9375, "step": 4390 }, { "epoch": 0.304477198809771, "grad_norm": 21.94414234814023, "learning_rate": 4.3900389114428315e-07, "logits/chosen": -2.90625, "logits/rejected": -2.96875, "logps/chosen": -596.0, "logps/rejected": -964.0, "loss": 0.3463, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.1875, "rewards/margins": 3.8125, "rewards/rejected": -8.0, "step": 4400 }, { "epoch": 0.3051691924434295, "grad_norm": 32.25340355187842, "learning_rate": 4.386080419257521e-07, "logits/chosen": -2.78125, "logits/rejected": -2.84375, "logps/chosen": -600.0, "logps/rejected": -1020.0, "loss": 0.319, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.1875, "rewards/margins": 4.15625, "rewards/rejected": -8.375, "step": 4410 }, { "epoch": 0.30586118607708807, "grad_norm": 21.14326562457214, "learning_rate": 4.3821109208309737e-07, "logits/chosen": -2.890625, "logits/rejected": -3.125, "logps/chosen": -624.0, "logps/rejected": -1032.0, "loss": 0.2851, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.53125, "rewards/margins": 4.15625, "rewards/rejected": -8.6875, "step": 4420 }, { "epoch": 0.30655317971074664, "grad_norm": 16.454679135521722, "learning_rate": 4.378130439327238e-07, "logits/chosen": -2.796875, "logits/rejected": -2.875, "logps/chosen": -584.0, "logps/rejected": -968.0, "loss": 0.3334, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.0625, "rewards/margins": 3.9375, "rewards/rejected": -8.0, "step": 4430 }, { "epoch": 0.3072451733444052, "grad_norm": 14.640031809436119, "learning_rate": 4.374138997974454e-07, "logits/chosen": -2.90625, "logits/rejected": -3.0, "logps/chosen": -600.0, "logps/rejected": -1016.0, "loss": 0.3068, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.1875, "rewards/margins": 4.125, "rewards/rejected": -8.3125, "step": 4440 }, { "epoch": 0.3079371669780638, "grad_norm": 21.307074828903335, "learning_rate": 4.370136620064718e-07, "logits/chosen": -2.828125, "logits/rejected": -2.796875, "logps/chosen": -636.0, "logps/rejected": -1016.0, "loss": 0.31, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.71875, "rewards/margins": 3.734375, "rewards/rejected": -8.5, "step": 4450 }, { "epoch": 0.30862916061172235, "grad_norm": 15.633890597966108, "learning_rate": 4.366123328953946e-07, "logits/chosen": -2.875, "logits/rejected": -3.15625, "logps/chosen": -560.0, "logps/rejected": -964.0, "loss": 0.3093, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -3.9375, "rewards/margins": 4.1875, "rewards/rejected": -8.125, "step": 4460 }, { "epoch": 0.3093211542453809, "grad_norm": 18.47647713616181, "learning_rate": 4.3620991480617384e-07, "logits/chosen": -2.84375, "logits/rejected": -3.0, "logps/chosen": -576.0, "logps/rejected": -976.0, "loss": 0.2625, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.09375, "rewards/margins": 3.96875, "rewards/rejected": -8.0625, "step": 4470 }, { "epoch": 0.3100131478790395, "grad_norm": 25.267511388134057, "learning_rate": 4.3580641008712436e-07, "logits/chosen": -2.90625, "logits/rejected": -2.953125, "logps/chosen": -624.0, "logps/rejected": -1040.0, "loss": 0.2885, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.40625, "rewards/margins": 4.375, "rewards/rejected": -8.75, "step": 4480 }, { "epoch": 0.31070514151269807, "grad_norm": 29.466800629428725, "learning_rate": 4.3540182109290203e-07, "logits/chosen": -2.90625, "logits/rejected": -3.078125, "logps/chosen": -584.0, "logps/rejected": -1064.0, "loss": 0.3554, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.09375, "rewards/margins": 4.8125, "rewards/rejected": -8.9375, "step": 4490 }, { "epoch": 0.31139713514635664, "grad_norm": 27.570299576959002, "learning_rate": 4.349961501844899e-07, "logits/chosen": -2.859375, "logits/rejected": -2.828125, "logps/chosen": -640.0, "logps/rejected": -1048.0, "loss": 0.3246, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.5625, "rewards/margins": 4.125, "rewards/rejected": -8.6875, "step": 4500 }, { "epoch": 0.3120891287800152, "grad_norm": 31.201250033193777, "learning_rate": 4.3458939972918474e-07, "logits/chosen": -2.84375, "logits/rejected": -2.796875, "logps/chosen": -616.0, "logps/rejected": -1040.0, "loss": 0.3015, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.4375, "rewards/margins": 4.25, "rewards/rejected": -8.6875, "step": 4510 }, { "epoch": 0.3127811224136738, "grad_norm": 18.63347012554784, "learning_rate": 4.341815721005828e-07, "logits/chosen": -2.78125, "logits/rejected": -2.953125, "logps/chosen": -608.0, "logps/rejected": -972.0, "loss": 0.3419, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.4375, "rewards/margins": 3.78125, "rewards/rejected": -8.25, "step": 4520 }, { "epoch": 0.31347311604733236, "grad_norm": 18.837964806762283, "learning_rate": 4.3377266967856645e-07, "logits/chosen": -2.765625, "logits/rejected": -2.8125, "logps/chosen": -596.0, "logps/rejected": -944.0, "loss": 0.3318, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.1875, "rewards/margins": 3.6875, "rewards/rejected": -7.875, "step": 4530 }, { "epoch": 0.31416510968099093, "grad_norm": 19.114905200454064, "learning_rate": 4.333626948492898e-07, "logits/chosen": -2.6875, "logits/rejected": -2.859375, "logps/chosen": -600.0, "logps/rejected": -976.0, "loss": 0.3357, "rewards/accuracies": 0.96875, "rewards/chosen": -4.21875, "rewards/margins": 3.96875, "rewards/rejected": -8.1875, "step": 4540 }, { "epoch": 0.3148571033146495, "grad_norm": 18.387847066289982, "learning_rate": 4.3295165000516507e-07, "logits/chosen": -2.8125, "logits/rejected": -2.9375, "logps/chosen": -596.0, "logps/rejected": -1012.0, "loss": 0.3021, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.3125, "rewards/margins": 4.3125, "rewards/rejected": -8.625, "step": 4550 }, { "epoch": 0.31554909694830807, "grad_norm": 13.774759086649077, "learning_rate": 4.3253953754484864e-07, "logits/chosen": -2.75, "logits/rejected": -2.890625, "logps/chosen": -604.0, "logps/rejected": -1004.0, "loss": 0.2772, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.3125, "rewards/margins": 4.15625, "rewards/rejected": -8.5, "step": 4560 }, { "epoch": 0.31624109058196664, "grad_norm": 19.705274837689153, "learning_rate": 4.3212635987322677e-07, "logits/chosen": -2.78125, "logits/rejected": -3.015625, "logps/chosen": -668.0, "logps/rejected": -1048.0, "loss": 0.3506, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.65625, "rewards/margins": 4.0625, "rewards/rejected": -8.75, "step": 4570 }, { "epoch": 0.3169330842156252, "grad_norm": 26.27921245531654, "learning_rate": 4.3171211940140214e-07, "logits/chosen": -2.796875, "logits/rejected": -2.890625, "logps/chosen": -644.0, "logps/rejected": -1064.0, "loss": 0.2851, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.5625, "rewards/margins": 4.375, "rewards/rejected": -8.9375, "step": 4580 }, { "epoch": 0.3176250778492838, "grad_norm": 39.31441124009804, "learning_rate": 4.31296818546679e-07, "logits/chosen": -2.921875, "logits/rejected": -3.109375, "logps/chosen": -604.0, "logps/rejected": -1040.0, "loss": 0.3175, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.15625, "rewards/margins": 4.5, "rewards/rejected": -8.6875, "step": 4590 }, { "epoch": 0.31831707148294236, "grad_norm": 22.318035588365465, "learning_rate": 4.3088045973254964e-07, "logits/chosen": -2.859375, "logits/rejected": -2.953125, "logps/chosen": -592.0, "logps/rejected": -1032.0, "loss": 0.3421, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.0625, "rewards/margins": 4.59375, "rewards/rejected": -8.6875, "step": 4600 }, { "epoch": 0.31900906511660093, "grad_norm": 22.217218942440027, "learning_rate": 4.3046304538868027e-07, "logits/chosen": -2.796875, "logits/rejected": -2.9375, "logps/chosen": -648.0, "logps/rejected": -968.0, "loss": 0.3212, "rewards/accuracies": 0.90625, "rewards/chosen": -4.53125, "rewards/margins": 3.640625, "rewards/rejected": -8.1875, "step": 4610 }, { "epoch": 0.3197010587502595, "grad_norm": 18.34241793274459, "learning_rate": 4.3004457795089643e-07, "logits/chosen": -2.71875, "logits/rejected": -2.75, "logps/chosen": -564.0, "logps/rejected": -972.0, "loss": 0.3198, "rewards/accuracies": 0.9375, "rewards/chosen": -3.859375, "rewards/margins": 4.125, "rewards/rejected": -8.0, "step": 4620 }, { "epoch": 0.3203930523839181, "grad_norm": 20.012821963808513, "learning_rate": 4.296250598611689e-07, "logits/chosen": -2.671875, "logits/rejected": -2.84375, "logps/chosen": -568.0, "logps/rejected": -956.0, "loss": 0.3279, "rewards/accuracies": 0.90625, "rewards/chosen": -4.0, "rewards/margins": 3.890625, "rewards/rejected": -7.90625, "step": 4630 }, { "epoch": 0.32108504601757665, "grad_norm": 23.698638049727574, "learning_rate": 4.2920449356759995e-07, "logits/chosen": -2.8125, "logits/rejected": -3.0, "logps/chosen": -568.0, "logps/rejected": -936.0, "loss": 0.302, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -3.890625, "rewards/margins": 3.96875, "rewards/rejected": -7.875, "step": 4640 }, { "epoch": 0.3217770396512352, "grad_norm": 23.78323662287433, "learning_rate": 4.287828815244083e-07, "logits/chosen": -2.78125, "logits/rejected": -2.96875, "logps/chosen": -596.0, "logps/rejected": -1004.0, "loss": 0.3608, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.34375, "rewards/margins": 4.03125, "rewards/rejected": -8.375, "step": 4650 }, { "epoch": 0.3224690332848938, "grad_norm": 20.860165743978257, "learning_rate": 4.283602261919152e-07, "logits/chosen": -2.96875, "logits/rejected": -3.109375, "logps/chosen": -640.0, "logps/rejected": -1056.0, "loss": 0.2891, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.8125, "rewards/margins": 4.1875, "rewards/rejected": -9.0, "step": 4660 }, { "epoch": 0.32316102691855236, "grad_norm": 30.015605274223887, "learning_rate": 4.2793653003653017e-07, "logits/chosen": -2.90625, "logits/rejected": -3.046875, "logps/chosen": -600.0, "logps/rejected": -996.0, "loss": 0.3034, "rewards/accuracies": 0.9375, "rewards/chosen": -4.25, "rewards/margins": 4.15625, "rewards/rejected": -8.4375, "step": 4670 }, { "epoch": 0.32385302055221094, "grad_norm": 13.69948855944499, "learning_rate": 4.2751179553073633e-07, "logits/chosen": -2.875, "logits/rejected": -3.046875, "logps/chosen": -552.0, "logps/rejected": -972.0, "loss": 0.3208, "rewards/accuracies": 0.96875, "rewards/chosen": -3.6875, "rewards/margins": 4.25, "rewards/rejected": -7.9375, "step": 4680 }, { "epoch": 0.3245450141858695, "grad_norm": 22.605144407892116, "learning_rate": 4.2708602515307625e-07, "logits/chosen": -2.875, "logits/rejected": -2.921875, "logps/chosen": -556.0, "logps/rejected": -968.0, "loss": 0.3119, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.03125, "rewards/margins": 4.0625, "rewards/rejected": -8.0625, "step": 4690 }, { "epoch": 0.3252370078195281, "grad_norm": 25.677754184798584, "learning_rate": 4.2665922138813714e-07, "logits/chosen": -2.9375, "logits/rejected": -3.0, "logps/chosen": -624.0, "logps/rejected": -1032.0, "loss": 0.2952, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.5625, "rewards/margins": 4.15625, "rewards/rejected": -8.6875, "step": 4700 }, { "epoch": 0.32592900145318665, "grad_norm": 23.577921494673735, "learning_rate": 4.262313867265368e-07, "logits/chosen": -2.890625, "logits/rejected": -3.1875, "logps/chosen": -568.0, "logps/rejected": -964.0, "loss": 0.2939, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.15625, "rewards/margins": 4.03125, "rewards/rejected": -8.1875, "step": 4710 }, { "epoch": 0.3266209950868452, "grad_norm": 22.67771356906046, "learning_rate": 4.2580252366490854e-07, "logits/chosen": -2.96875, "logits/rejected": -3.234375, "logps/chosen": -588.0, "logps/rejected": -980.0, "loss": 0.3286, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.15625, "rewards/margins": 4.03125, "rewards/rejected": -8.1875, "step": 4720 }, { "epoch": 0.3273129887205038, "grad_norm": 21.50085389322926, "learning_rate": 4.25372634705887e-07, "logits/chosen": -2.921875, "logits/rejected": -2.828125, "logps/chosen": -608.0, "logps/rejected": -1096.0, "loss": 0.3108, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.375, "rewards/margins": 4.84375, "rewards/rejected": -9.25, "step": 4730 }, { "epoch": 0.32800498235416237, "grad_norm": 21.197041277110767, "learning_rate": 4.249417223580938e-07, "logits/chosen": -2.90625, "logits/rejected": -3.171875, "logps/chosen": -616.0, "logps/rejected": -1032.0, "loss": 0.3157, "rewards/accuracies": 0.9375, "rewards/chosen": -4.375, "rewards/margins": 4.375, "rewards/rejected": -8.75, "step": 4740 }, { "epoch": 0.32869697598782094, "grad_norm": 23.894082720577607, "learning_rate": 4.245097891361221e-07, "logits/chosen": -2.84375, "logits/rejected": -3.078125, "logps/chosen": -552.0, "logps/rejected": -992.0, "loss": 0.298, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.765625, "rewards/margins": 4.4375, "rewards/rejected": -8.1875, "step": 4750 }, { "epoch": 0.32938896962147945, "grad_norm": 23.249963258078793, "learning_rate": 4.2407683756052255e-07, "logits/chosen": -2.765625, "logits/rejected": -2.921875, "logps/chosen": -600.0, "logps/rejected": -1056.0, "loss": 0.2892, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.375, "rewards/margins": 4.4375, "rewards/rejected": -8.8125, "step": 4760 }, { "epoch": 0.330080963255138, "grad_norm": 23.580536553206258, "learning_rate": 4.2364287015778846e-07, "logits/chosen": -2.84375, "logits/rejected": -2.9375, "logps/chosen": -600.0, "logps/rejected": -1016.0, "loss": 0.3445, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.125, "rewards/margins": 4.34375, "rewards/rejected": -8.4375, "step": 4770 }, { "epoch": 0.3307729568887966, "grad_norm": 24.05907214568259, "learning_rate": 4.232078894603409e-07, "logits/chosen": -2.796875, "logits/rejected": -2.90625, "logps/chosen": -612.0, "logps/rejected": -1056.0, "loss": 0.2851, "rewards/accuracies": 0.96875, "rewards/chosen": -4.25, "rewards/margins": 4.375, "rewards/rejected": -8.625, "step": 4780 }, { "epoch": 0.33146495052245517, "grad_norm": 21.623153587185627, "learning_rate": 4.227718980065142e-07, "logits/chosen": -2.921875, "logits/rejected": -2.890625, "logps/chosen": -612.0, "logps/rejected": -1032.0, "loss": 0.3221, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.34375, "rewards/margins": 4.3125, "rewards/rejected": -8.6875, "step": 4790 }, { "epoch": 0.33215694415611374, "grad_norm": 23.804435481011737, "learning_rate": 4.2233489834054084e-07, "logits/chosen": -3.046875, "logits/rejected": -3.21875, "logps/chosen": -604.0, "logps/rejected": -1048.0, "loss": 0.3173, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.4375, "rewards/margins": 4.46875, "rewards/rejected": -8.875, "step": 4800 }, { "epoch": 0.3328489377897723, "grad_norm": 29.79974600700019, "learning_rate": 4.2189689301253666e-07, "logits/chosen": -2.953125, "logits/rejected": -3.0625, "logps/chosen": -636.0, "logps/rejected": -1032.0, "loss": 0.3048, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.5625, "rewards/margins": 4.25, "rewards/rejected": -8.8125, "step": 4810 }, { "epoch": 0.3335409314234309, "grad_norm": 22.395983466969405, "learning_rate": 4.214578845784863e-07, "logits/chosen": -2.828125, "logits/rejected": -2.859375, "logps/chosen": -584.0, "logps/rejected": -1040.0, "loss": 0.3266, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.25, "rewards/margins": 4.46875, "rewards/rejected": -8.75, "step": 4820 }, { "epoch": 0.33423292505708946, "grad_norm": 26.741452351744513, "learning_rate": 4.2101787560022784e-07, "logits/chosen": -2.921875, "logits/rejected": -2.859375, "logps/chosen": -580.0, "logps/rejected": -996.0, "loss": 0.3083, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.0625, "rewards/margins": 4.25, "rewards/rejected": -8.3125, "step": 4830 }, { "epoch": 0.33492491869074803, "grad_norm": 26.357577861854804, "learning_rate": 4.2057686864543793e-07, "logits/chosen": -2.921875, "logits/rejected": -3.03125, "logps/chosen": -568.0, "logps/rejected": -944.0, "loss": 0.3235, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.6875, "rewards/margins": 4.0625, "rewards/rejected": -7.75, "step": 4840 }, { "epoch": 0.3356169123244066, "grad_norm": 23.02982633710617, "learning_rate": 4.201348662876174e-07, "logits/chosen": -2.859375, "logits/rejected": -3.015625, "logps/chosen": -576.0, "logps/rejected": -968.0, "loss": 0.2971, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.953125, "rewards/margins": 4.0, "rewards/rejected": -7.9375, "step": 4850 }, { "epoch": 0.3363089059580652, "grad_norm": 15.824807184793825, "learning_rate": 4.196918711060753e-07, "logits/chosen": -2.9375, "logits/rejected": -3.015625, "logps/chosen": -596.0, "logps/rejected": -1032.0, "loss": 0.3142, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.1875, "rewards/margins": 4.4375, "rewards/rejected": -8.6875, "step": 4860 }, { "epoch": 0.33700089959172375, "grad_norm": 17.38817107597906, "learning_rate": 4.1924788568591437e-07, "logits/chosen": -2.84375, "logits/rejected": -2.921875, "logps/chosen": -620.0, "logps/rejected": -1088.0, "loss": 0.287, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.1875, "rewards/margins": 4.875, "rewards/rejected": -9.0625, "step": 4870 }, { "epoch": 0.3376928932253823, "grad_norm": 26.95569105299153, "learning_rate": 4.1880291261801604e-07, "logits/chosen": -2.96875, "logits/rejected": -3.28125, "logps/chosen": -644.0, "logps/rejected": -992.0, "loss": 0.3555, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -4.59375, "rewards/margins": 3.671875, "rewards/rejected": -8.25, "step": 4880 }, { "epoch": 0.3383848868590409, "grad_norm": 15.942472512290003, "learning_rate": 4.183569544990252e-07, "logits/chosen": -2.875, "logits/rejected": -2.859375, "logps/chosen": -620.0, "logps/rejected": -1048.0, "loss": 0.2996, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.53125, "rewards/margins": 3.984375, "rewards/rejected": -8.5, "step": 4890 }, { "epoch": 0.33907688049269946, "grad_norm": 27.52247497419323, "learning_rate": 4.1791001393133486e-07, "logits/chosen": -2.875, "logits/rejected": -3.046875, "logps/chosen": -632.0, "logps/rejected": -1008.0, "loss": 0.3014, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.375, "rewards/margins": 4.125, "rewards/rejected": -8.5, "step": 4900 }, { "epoch": 0.33976887412635803, "grad_norm": 24.4770698477749, "learning_rate": 4.174620935230713e-07, "logits/chosen": -2.9375, "logits/rejected": -3.203125, "logps/chosen": -592.0, "logps/rejected": -1008.0, "loss": 0.276, "rewards/accuracies": 0.90625, "rewards/chosen": -4.25, "rewards/margins": 4.375, "rewards/rejected": -8.625, "step": 4910 }, { "epoch": 0.3404608677600166, "grad_norm": 29.91091690634603, "learning_rate": 4.1701319588807836e-07, "logits/chosen": -2.9375, "logits/rejected": -3.09375, "logps/chosen": -640.0, "logps/rejected": -1040.0, "loss": 0.3473, "rewards/accuracies": 0.9375, "rewards/chosen": -4.71875, "rewards/margins": 4.0, "rewards/rejected": -8.6875, "step": 4920 }, { "epoch": 0.3411528613936752, "grad_norm": 25.47128166395693, "learning_rate": 4.165633236459028e-07, "logits/chosen": -2.953125, "logits/rejected": -2.984375, "logps/chosen": -632.0, "logps/rejected": -1056.0, "loss": 0.2529, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.59375, "rewards/margins": 4.40625, "rewards/rejected": -9.0, "step": 4930 }, { "epoch": 0.34184485502733375, "grad_norm": 18.79225759625932, "learning_rate": 4.161124794217786e-07, "logits/chosen": -2.890625, "logits/rejected": -2.96875, "logps/chosen": -576.0, "logps/rejected": -936.0, "loss": 0.3064, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.15625, "rewards/margins": 3.703125, "rewards/rejected": -7.84375, "step": 4940 }, { "epoch": 0.3425368486609923, "grad_norm": 19.83380154798566, "learning_rate": 4.156606658466119e-07, "logits/chosen": -2.96875, "logits/rejected": -3.078125, "logps/chosen": -572.0, "logps/rejected": -976.0, "loss": 0.3073, "rewards/accuracies": 0.90625, "rewards/chosen": -4.0625, "rewards/margins": 4.03125, "rewards/rejected": -8.125, "step": 4950 }, { "epoch": 0.3432288422946509, "grad_norm": 19.152386515014815, "learning_rate": 4.1520788555696517e-07, "logits/chosen": -2.96875, "logits/rejected": -3.109375, "logps/chosen": -600.0, "logps/rejected": -1020.0, "loss": 0.2658, "rewards/accuracies": 0.9375, "rewards/chosen": -4.4375, "rewards/margins": 4.1875, "rewards/rejected": -8.625, "step": 4960 }, { "epoch": 0.34392083592830947, "grad_norm": 16.815381072929977, "learning_rate": 4.1475414119504243e-07, "logits/chosen": -2.984375, "logits/rejected": -3.078125, "logps/chosen": -620.0, "logps/rejected": -996.0, "loss": 0.3258, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.6875, "rewards/margins": 3.78125, "rewards/rejected": -8.4375, "step": 4970 }, { "epoch": 0.34461282956196804, "grad_norm": 23.039880631017887, "learning_rate": 4.1429943540867345e-07, "logits/chosen": -2.953125, "logits/rejected": -3.09375, "logps/chosen": -640.0, "logps/rejected": -1056.0, "loss": 0.2888, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.53125, "rewards/margins": 4.375, "rewards/rejected": -8.875, "step": 4980 }, { "epoch": 0.3453048231956266, "grad_norm": 21.475331207310177, "learning_rate": 4.138437708512984e-07, "logits/chosen": -2.9375, "logits/rejected": -3.015625, "logps/chosen": -608.0, "logps/rejected": -1020.0, "loss": 0.2912, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.1875, "rewards/margins": 4.4375, "rewards/rejected": -8.6875, "step": 4990 }, { "epoch": 0.3459968168292852, "grad_norm": 37.906955314514654, "learning_rate": 4.133871501819524e-07, "logits/chosen": -2.921875, "logits/rejected": -3.09375, "logps/chosen": -612.0, "logps/rejected": -1040.0, "loss": 0.2889, "rewards/accuracies": 0.9375, "rewards/chosen": -4.4375, "rewards/margins": 4.3125, "rewards/rejected": -8.75, "step": 5000 }, { "epoch": 0.3459968168292852, "eval_logits/chosen": -2.90625, "eval_logits/rejected": -3.0625, "eval_logps/chosen": -644.0, "eval_logps/rejected": -1004.0, "eval_loss": 0.23485353589057922, "eval_rewards/accuracies": 0.8935579061508179, "eval_rewards/chosen": -4.5625, "eval_rewards/margins": 3.84375, "eval_rewards/rejected": -8.375, "eval_runtime": 2935.095, "eval_samples_per_second": 33.332, "eval_steps_per_second": 0.521, "step": 5000 }, { "epoch": 0.34668881046294375, "grad_norm": 31.213363955374106, "learning_rate": 4.1292957606524994e-07, "logits/chosen": -2.96875, "logits/rejected": -3.09375, "logps/chosen": -612.0, "logps/rejected": -1048.0, "loss": 0.313, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.375, "rewards/margins": 4.53125, "rewards/rejected": -8.875, "step": 5010 }, { "epoch": 0.3473808040966023, "grad_norm": 22.788432042532083, "learning_rate": 4.1247105117136926e-07, "logits/chosen": -2.75, "logits/rejected": -2.9375, "logps/chosen": -604.0, "logps/rejected": -1016.0, "loss": 0.2957, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.4375, "rewards/margins": 4.0, "rewards/rejected": -8.4375, "step": 5020 }, { "epoch": 0.3480727977302609, "grad_norm": 23.31890337264225, "learning_rate": 4.1201157817603727e-07, "logits/chosen": -2.875, "logits/rejected": -2.96875, "logps/chosen": -612.0, "logps/rejected": -1020.0, "loss": 0.3709, "rewards/accuracies": 0.9375, "rewards/chosen": -4.25, "rewards/margins": 4.34375, "rewards/rejected": -8.5625, "step": 5030 }, { "epoch": 0.34876479136391947, "grad_norm": 20.742967269766083, "learning_rate": 4.11551159760513e-07, "logits/chosen": -2.875, "logits/rejected": -3.0, "logps/chosen": -608.0, "logps/rejected": -1008.0, "loss": 0.3116, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.1875, "rewards/margins": 4.0625, "rewards/rejected": -8.25, "step": 5040 }, { "epoch": 0.34945678499757804, "grad_norm": 23.277143386026705, "learning_rate": 4.1108979861157284e-07, "logits/chosen": -2.984375, "logits/rejected": -3.015625, "logps/chosen": -608.0, "logps/rejected": -1008.0, "loss": 0.2697, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.5625, "rewards/margins": 3.859375, "rewards/rejected": -8.375, "step": 5050 }, { "epoch": 0.3501487786312366, "grad_norm": 31.434231557303658, "learning_rate": 4.106274974214944e-07, "logits/chosen": -2.921875, "logits/rejected": -3.140625, "logps/chosen": -656.0, "logps/rejected": -1088.0, "loss": 0.3237, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.78125, "rewards/margins": 4.46875, "rewards/rejected": -9.25, "step": 5060 }, { "epoch": 0.3508407722648952, "grad_norm": 23.890922864234494, "learning_rate": 4.1016425888804094e-07, "logits/chosen": -2.9375, "logits/rejected": -3.046875, "logps/chosen": -640.0, "logps/rejected": -1096.0, "loss": 0.2746, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.625, "rewards/margins": 4.59375, "rewards/rejected": -9.25, "step": 5070 }, { "epoch": 0.35153276589855376, "grad_norm": 13.886900081374653, "learning_rate": 4.0970008571444546e-07, "logits/chosen": -3.015625, "logits/rejected": -3.15625, "logps/chosen": -612.0, "logps/rejected": -1056.0, "loss": 0.257, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.46875, "rewards/margins": 4.40625, "rewards/rejected": -8.875, "step": 5080 }, { "epoch": 0.35222475953221233, "grad_norm": 33.22834586454391, "learning_rate": 4.092349806093954e-07, "logits/chosen": -2.953125, "logits/rejected": -3.09375, "logps/chosen": -616.0, "logps/rejected": -1072.0, "loss": 0.3064, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.5, "rewards/margins": 4.53125, "rewards/rejected": -9.0625, "step": 5090 }, { "epoch": 0.3529167531658709, "grad_norm": 33.26726959576979, "learning_rate": 4.08768946287016e-07, "logits/chosen": -2.96875, "logits/rejected": -3.21875, "logps/chosen": -644.0, "logps/rejected": -1040.0, "loss": 0.273, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.65625, "rewards/margins": 4.15625, "rewards/rejected": -8.8125, "step": 5100 }, { "epoch": 0.35360874679952947, "grad_norm": 20.597005387355516, "learning_rate": 4.0830198546685547e-07, "logits/chosen": -2.953125, "logits/rejected": -3.234375, "logps/chosen": -624.0, "logps/rejected": -1064.0, "loss": 0.3029, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.46875, "rewards/margins": 4.59375, "rewards/rejected": -9.0625, "step": 5110 }, { "epoch": 0.354300740433188, "grad_norm": 20.669647290583423, "learning_rate": 4.0783410087386824e-07, "logits/chosen": -3.0625, "logits/rejected": -3.359375, "logps/chosen": -612.0, "logps/rejected": -1000.0, "loss": 0.2911, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.4375, "rewards/margins": 4.0625, "rewards/rejected": -8.5, "step": 5120 }, { "epoch": 0.35499273406684656, "grad_norm": 22.773319242635072, "learning_rate": 4.073652952383996e-07, "logits/chosen": -3.03125, "logits/rejected": -3.09375, "logps/chosen": -640.0, "logps/rejected": -1024.0, "loss": 0.3127, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.59375, "rewards/margins": 4.03125, "rewards/rejected": -8.625, "step": 5130 }, { "epoch": 0.35568472770050513, "grad_norm": 19.780219749283773, "learning_rate": 4.0689557129616946e-07, "logits/chosen": -2.875, "logits/rejected": -2.859375, "logps/chosen": -624.0, "logps/rejected": -1032.0, "loss": 0.2797, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.4375, "rewards/margins": 4.1875, "rewards/rejected": -8.625, "step": 5140 }, { "epoch": 0.3563767213341637, "grad_norm": 22.004847096284088, "learning_rate": 4.064249317882567e-07, "logits/chosen": -2.921875, "logits/rejected": -3.109375, "logps/chosen": -620.0, "logps/rejected": -1048.0, "loss": 0.3042, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.46875, "rewards/margins": 4.375, "rewards/rejected": -8.875, "step": 5150 }, { "epoch": 0.3570687149678223, "grad_norm": 24.293215326647157, "learning_rate": 4.059533794610829e-07, "logits/chosen": -2.875, "logits/rejected": -3.28125, "logps/chosen": -612.0, "logps/rejected": -1056.0, "loss": 0.2938, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.375, "rewards/margins": 4.78125, "rewards/rejected": -9.1875, "step": 5160 }, { "epoch": 0.35776070860148085, "grad_norm": 23.514221896408994, "learning_rate": 4.0548091706639636e-07, "logits/chosen": -2.890625, "logits/rejected": -2.984375, "logps/chosen": -600.0, "logps/rejected": -1032.0, "loss": 0.2638, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.21875, "rewards/margins": 4.4375, "rewards/rejected": -8.625, "step": 5170 }, { "epoch": 0.3584527022351394, "grad_norm": 19.739749814635733, "learning_rate": 4.050075473612562e-07, "logits/chosen": -2.921875, "logits/rejected": -2.90625, "logps/chosen": -640.0, "logps/rejected": -1056.0, "loss": 0.3048, "rewards/accuracies": 0.9375, "rewards/chosen": -4.625, "rewards/margins": 4.15625, "rewards/rejected": -8.75, "step": 5180 }, { "epoch": 0.359144695868798, "grad_norm": 25.23883757301326, "learning_rate": 4.0453327310801586e-07, "logits/chosen": -2.890625, "logits/rejected": -3.203125, "logps/chosen": -616.0, "logps/rejected": -1024.0, "loss": 0.3083, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -4.40625, "rewards/margins": 4.25, "rewards/rejected": -8.625, "step": 5190 }, { "epoch": 0.35983668950245656, "grad_norm": 18.119882161847954, "learning_rate": 4.0405809707430783e-07, "logits/chosen": -3.015625, "logits/rejected": -3.171875, "logps/chosen": -632.0, "logps/rejected": -1024.0, "loss": 0.2939, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.59375, "rewards/margins": 4.125, "rewards/rejected": -8.6875, "step": 5200 }, { "epoch": 0.36052868313611514, "grad_norm": 22.685298622989787, "learning_rate": 4.035820220330265e-07, "logits/chosen": -3.03125, "logits/rejected": -3.0625, "logps/chosen": -620.0, "logps/rejected": -1096.0, "loss": 0.3245, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.625, "rewards/margins": 4.71875, "rewards/rejected": -9.3125, "step": 5210 }, { "epoch": 0.3612206767697737, "grad_norm": 17.75757199172176, "learning_rate": 4.031050507623125e-07, "logits/chosen": -2.75, "logits/rejected": -3.0, "logps/chosen": -636.0, "logps/rejected": -1080.0, "loss": 0.3079, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.46875, "rewards/margins": 4.625, "rewards/rejected": -9.0625, "step": 5220 }, { "epoch": 0.3619126704034323, "grad_norm": 17.830782200010574, "learning_rate": 4.0262718604553645e-07, "logits/chosen": -2.90625, "logits/rejected": -3.1875, "logps/chosen": -628.0, "logps/rejected": -1032.0, "loss": 0.2857, "rewards/accuracies": 0.9375, "rewards/chosen": -4.5, "rewards/margins": 4.3125, "rewards/rejected": -8.8125, "step": 5230 }, { "epoch": 0.36260466403709085, "grad_norm": 24.50312442858543, "learning_rate": 4.021484306712829e-07, "logits/chosen": -2.890625, "logits/rejected": -3.0625, "logps/chosen": -584.0, "logps/rejected": -976.0, "loss": 0.2952, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.1875, "rewards/margins": 3.875, "rewards/rejected": -8.0625, "step": 5240 }, { "epoch": 0.3632966576707494, "grad_norm": 28.057557423683264, "learning_rate": 4.016687874333334e-07, "logits/chosen": -2.984375, "logits/rejected": -3.125, "logps/chosen": -612.0, "logps/rejected": -1040.0, "loss": 0.3086, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.40625, "rewards/margins": 4.40625, "rewards/rejected": -8.8125, "step": 5250 }, { "epoch": 0.363988651304408, "grad_norm": 21.004027791632005, "learning_rate": 4.01188259130651e-07, "logits/chosen": -2.890625, "logits/rejected": -2.84375, "logps/chosen": -644.0, "logps/rejected": -1056.0, "loss": 0.2567, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.625, "rewards/margins": 4.21875, "rewards/rejected": -8.8125, "step": 5260 }, { "epoch": 0.36468064493806657, "grad_norm": 25.63366840997234, "learning_rate": 4.0070684856736347e-07, "logits/chosen": -2.96875, "logits/rejected": -3.0, "logps/chosen": -624.0, "logps/rejected": -1040.0, "loss": 0.2969, "rewards/accuracies": 0.9375, "rewards/chosen": -4.5625, "rewards/margins": 4.34375, "rewards/rejected": -8.9375, "step": 5270 }, { "epoch": 0.36537263857172514, "grad_norm": 17.45801154721632, "learning_rate": 4.0022455855274695e-07, "logits/chosen": -2.953125, "logits/rejected": -2.875, "logps/chosen": -612.0, "logps/rejected": -1072.0, "loss": 0.2639, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.34375, "rewards/margins": 4.5, "rewards/rejected": -8.8125, "step": 5280 }, { "epoch": 0.3660646322053837, "grad_norm": 23.501290004748718, "learning_rate": 3.997413919012097e-07, "logits/chosen": -2.875, "logits/rejected": -3.328125, "logps/chosen": -616.0, "logps/rejected": -1032.0, "loss": 0.2656, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.5, "rewards/margins": 4.28125, "rewards/rejected": -8.75, "step": 5290 }, { "epoch": 0.3667566258390423, "grad_norm": 19.09326606447665, "learning_rate": 3.9925735143227555e-07, "logits/chosen": -3.078125, "logits/rejected": -3.15625, "logps/chosen": -604.0, "logps/rejected": -1056.0, "loss": 0.3069, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.53125, "rewards/margins": 4.46875, "rewards/rejected": -9.0, "step": 5300 }, { "epoch": 0.36744861947270085, "grad_norm": 19.869110673708434, "learning_rate": 3.9877243997056743e-07, "logits/chosen": -3.015625, "logits/rejected": -3.359375, "logps/chosen": -640.0, "logps/rejected": -1048.0, "loss": 0.284, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.625, "rewards/margins": 4.25, "rewards/rejected": -8.875, "step": 5310 }, { "epoch": 0.3681406131063594, "grad_norm": 20.498789317540076, "learning_rate": 3.9828666034579115e-07, "logits/chosen": -2.828125, "logits/rejected": -3.171875, "logps/chosen": -636.0, "logps/rejected": -1072.0, "loss": 0.3096, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.625, "rewards/margins": 4.5, "rewards/rejected": -9.125, "step": 5320 }, { "epoch": 0.368832606740018, "grad_norm": 19.869727460120345, "learning_rate": 3.978000153927187e-07, "logits/chosen": -2.921875, "logits/rejected": -3.1875, "logps/chosen": -600.0, "logps/rejected": -1040.0, "loss": 0.2787, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.15625, "rewards/margins": 4.65625, "rewards/rejected": -8.8125, "step": 5330 }, { "epoch": 0.36952460037367657, "grad_norm": 22.68159643184761, "learning_rate": 3.973125079511713e-07, "logits/chosen": -2.859375, "logits/rejected": -3.046875, "logps/chosen": -596.0, "logps/rejected": -1072.0, "loss": 0.2899, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.25, "rewards/margins": 4.71875, "rewards/rejected": -8.9375, "step": 5340 }, { "epoch": 0.37021659400733514, "grad_norm": 16.57620206204998, "learning_rate": 3.9682414086600374e-07, "logits/chosen": -3.0, "logits/rejected": -3.109375, "logps/chosen": -592.0, "logps/rejected": -1032.0, "loss": 0.2961, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.34375, "rewards/margins": 4.5, "rewards/rejected": -8.875, "step": 5350 }, { "epoch": 0.3709085876409937, "grad_norm": 17.86764078218946, "learning_rate": 3.96334916987087e-07, "logits/chosen": -2.8125, "logits/rejected": -2.875, "logps/chosen": -592.0, "logps/rejected": -1024.0, "loss": 0.3395, "rewards/accuracies": 0.9375, "rewards/chosen": -4.15625, "rewards/margins": 4.3125, "rewards/rejected": -8.4375, "step": 5360 }, { "epoch": 0.3716005812746523, "grad_norm": 18.975867144616306, "learning_rate": 3.958448391692919e-07, "logits/chosen": -2.96875, "logits/rejected": -3.0, "logps/chosen": -628.0, "logps/rejected": -1040.0, "loss": 0.3234, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.625, "rewards/margins": 4.0625, "rewards/rejected": -8.6875, "step": 5370 }, { "epoch": 0.37229257490831086, "grad_norm": 21.669953022869336, "learning_rate": 3.9535391027247255e-07, "logits/chosen": -3.015625, "logits/rejected": -3.375, "logps/chosen": -628.0, "logps/rejected": -1020.0, "loss": 0.2926, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.65625, "rewards/margins": 4.125, "rewards/rejected": -8.75, "step": 5380 }, { "epoch": 0.37298456854196943, "grad_norm": 15.770370077262823, "learning_rate": 3.9486213316144944e-07, "logits/chosen": -2.875, "logits/rejected": -2.953125, "logps/chosen": -636.0, "logps/rejected": -1020.0, "loss": 0.2721, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.34375, "rewards/margins": 4.03125, "rewards/rejected": -8.375, "step": 5390 }, { "epoch": 0.373676562175628, "grad_norm": 23.635871793293052, "learning_rate": 3.943695107059928e-07, "logits/chosen": -2.953125, "logits/rejected": -3.09375, "logps/chosen": -620.0, "logps/rejected": -1032.0, "loss": 0.2679, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.3125, "rewards/margins": 4.21875, "rewards/rejected": -8.5625, "step": 5400 }, { "epoch": 0.3743685558092866, "grad_norm": 17.803186556281986, "learning_rate": 3.938760457808059e-07, "logits/chosen": -3.0625, "logits/rejected": -3.1875, "logps/chosen": -688.0, "logps/rejected": -1056.0, "loss": 0.3091, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.15625, "rewards/margins": 3.828125, "rewards/rejected": -9.0, "step": 5410 }, { "epoch": 0.37506054944294515, "grad_norm": 28.6900988449045, "learning_rate": 3.933817412655083e-07, "logits/chosen": -2.90625, "logits/rejected": -3.125, "logps/chosen": -608.0, "logps/rejected": -1024.0, "loss": 0.2714, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.40625, "rewards/margins": 4.3125, "rewards/rejected": -8.6875, "step": 5420 }, { "epoch": 0.3757525430766037, "grad_norm": 20.423686546604397, "learning_rate": 3.9288660004461907e-07, "logits/chosen": -2.859375, "logits/rejected": -3.140625, "logps/chosen": -656.0, "logps/rejected": -1032.0, "loss": 0.3202, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.4375, "rewards/margins": 4.1875, "rewards/rejected": -8.625, "step": 5430 }, { "epoch": 0.3764445367102623, "grad_norm": 15.339870067650786, "learning_rate": 3.9239062500753957e-07, "logits/chosen": -2.796875, "logits/rejected": -3.0, "logps/chosen": -696.0, "logps/rejected": -1064.0, "loss": 0.266, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.875, "rewards/margins": 3.953125, "rewards/rejected": -8.8125, "step": 5440 }, { "epoch": 0.37713653034392086, "grad_norm": 22.1250781038847, "learning_rate": 3.9189381904853724e-07, "logits/chosen": -2.859375, "logits/rejected": -3.0, "logps/chosen": -620.0, "logps/rejected": -1064.0, "loss": 0.2957, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.5625, "rewards/margins": 4.46875, "rewards/rejected": -9.0625, "step": 5450 }, { "epoch": 0.37782852397757943, "grad_norm": 24.828177761609982, "learning_rate": 3.913961850667282e-07, "logits/chosen": -2.921875, "logits/rejected": -3.0625, "logps/chosen": -624.0, "logps/rejected": -1064.0, "loss": 0.2688, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.5, "rewards/margins": 4.46875, "rewards/rejected": -9.0, "step": 5460 }, { "epoch": 0.37852051761123795, "grad_norm": 22.04177919728768, "learning_rate": 3.9089772596606066e-07, "logits/chosen": -2.84375, "logits/rejected": -3.046875, "logps/chosen": -604.0, "logps/rejected": -1012.0, "loss": 0.2525, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.28125, "rewards/margins": 4.1875, "rewards/rejected": -8.5, "step": 5470 }, { "epoch": 0.3792125112448965, "grad_norm": 30.7015102827407, "learning_rate": 3.903984446552976e-07, "logits/chosen": -2.953125, "logits/rejected": -3.015625, "logps/chosen": -580.0, "logps/rejected": -1040.0, "loss": 0.3168, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.15625, "rewards/margins": 4.5625, "rewards/rejected": -8.75, "step": 5480 }, { "epoch": 0.3799045048785551, "grad_norm": 20.111661200020016, "learning_rate": 3.8989834404800013e-07, "logits/chosen": -2.828125, "logits/rejected": -3.0625, "logps/chosen": -600.0, "logps/rejected": -1000.0, "loss": 0.2849, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.125, "rewards/margins": 4.09375, "rewards/rejected": -8.1875, "step": 5490 }, { "epoch": 0.38059649851221367, "grad_norm": 18.698785531673217, "learning_rate": 3.8939742706251047e-07, "logits/chosen": -2.96875, "logits/rejected": -3.234375, "logps/chosen": -624.0, "logps/rejected": -996.0, "loss": 0.2508, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.5, "rewards/margins": 3.8125, "rewards/rejected": -8.3125, "step": 5500 }, { "epoch": 0.38128849214587224, "grad_norm": 16.86745676836903, "learning_rate": 3.888956966219346e-07, "logits/chosen": -3.0, "logits/rejected": -3.21875, "logps/chosen": -604.0, "logps/rejected": -1032.0, "loss": 0.2703, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.3125, "rewards/margins": 4.375, "rewards/rejected": -8.6875, "step": 5510 }, { "epoch": 0.3819804857795308, "grad_norm": 15.606408378789643, "learning_rate": 3.8839315565412554e-07, "logits/chosen": -2.96875, "logits/rejected": -3.09375, "logps/chosen": -640.0, "logps/rejected": -1080.0, "loss": 0.2892, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.625, "rewards/margins": 4.5, "rewards/rejected": -9.125, "step": 5520 }, { "epoch": 0.3826724794131894, "grad_norm": 26.24608568427094, "learning_rate": 3.878898070916662e-07, "logits/chosen": -3.03125, "logits/rejected": -3.296875, "logps/chosen": -664.0, "logps/rejected": -1088.0, "loss": 0.283, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.875, "rewards/margins": 4.59375, "rewards/rejected": -9.4375, "step": 5530 }, { "epoch": 0.38336447304684795, "grad_norm": 22.2381172664658, "learning_rate": 3.8738565387185227e-07, "logits/chosen": -3.015625, "logits/rejected": -3.234375, "logps/chosen": -648.0, "logps/rejected": -1104.0, "loss": 0.2583, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.59375, "rewards/margins": 4.78125, "rewards/rejected": -9.375, "step": 5540 }, { "epoch": 0.3840564666805065, "grad_norm": 39.37828404524332, "learning_rate": 3.868806989366748e-07, "logits/chosen": -2.8125, "logits/rejected": -3.078125, "logps/chosen": -624.0, "logps/rejected": -1080.0, "loss": 0.2761, "rewards/accuracies": 0.9375, "rewards/chosen": -4.40625, "rewards/margins": 4.75, "rewards/rejected": -9.125, "step": 5550 }, { "epoch": 0.3847484603141651, "grad_norm": 26.44056384100328, "learning_rate": 3.8637494523280343e-07, "logits/chosen": -2.84375, "logits/rejected": -3.21875, "logps/chosen": -620.0, "logps/rejected": -1032.0, "loss": 0.2466, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.375, "rewards/margins": 4.34375, "rewards/rejected": -8.75, "step": 5560 }, { "epoch": 0.38544045394782367, "grad_norm": 19.611019390548954, "learning_rate": 3.8586839571156893e-07, "logits/chosen": -2.90625, "logits/rejected": -3.15625, "logps/chosen": -616.0, "logps/rejected": -1040.0, "loss": 0.2412, "rewards/accuracies": 0.96875, "rewards/chosen": -4.40625, "rewards/margins": 4.53125, "rewards/rejected": -8.9375, "step": 5570 }, { "epoch": 0.38613244758148224, "grad_norm": 15.35080427339997, "learning_rate": 3.8536105332894626e-07, "logits/chosen": -2.90625, "logits/rejected": -3.046875, "logps/chosen": -636.0, "logps/rejected": -1064.0, "loss": 0.2485, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.625, "rewards/margins": 4.53125, "rewards/rejected": -9.125, "step": 5580 }, { "epoch": 0.3868244412151408, "grad_norm": 17.799815717184106, "learning_rate": 3.848529210455368e-07, "logits/chosen": -3.03125, "logits/rejected": -3.09375, "logps/chosen": -608.0, "logps/rejected": -1080.0, "loss": 0.3157, "rewards/accuracies": 0.9375, "rewards/chosen": -4.34375, "rewards/margins": 4.4375, "rewards/rejected": -8.75, "step": 5590 }, { "epoch": 0.3875164348487994, "grad_norm": 19.64768064961822, "learning_rate": 3.843440018265518e-07, "logits/chosen": -2.96875, "logits/rejected": -3.1875, "logps/chosen": -620.0, "logps/rejected": -1064.0, "loss": 0.2558, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.40625, "rewards/margins": 4.5625, "rewards/rejected": -9.0, "step": 5600 }, { "epoch": 0.38820842848245796, "grad_norm": 31.31262403655744, "learning_rate": 3.8383429864179426e-07, "logits/chosen": -2.78125, "logits/rejected": -3.015625, "logps/chosen": -624.0, "logps/rejected": -1112.0, "loss": 0.3112, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.53125, "rewards/margins": 4.875, "rewards/rejected": -9.375, "step": 5610 }, { "epoch": 0.38890042211611653, "grad_norm": 17.282845496123254, "learning_rate": 3.833238144656425e-07, "logits/chosen": -2.890625, "logits/rejected": -2.921875, "logps/chosen": -616.0, "logps/rejected": -1104.0, "loss": 0.2716, "rewards/accuracies": 0.96875, "rewards/chosen": -4.28125, "rewards/margins": 5.0625, "rewards/rejected": -9.375, "step": 5620 }, { "epoch": 0.3895924157497751, "grad_norm": 28.302726044262762, "learning_rate": 3.828125522770317e-07, "logits/chosen": -2.90625, "logits/rejected": -3.1875, "logps/chosen": -620.0, "logps/rejected": -1056.0, "loss": 0.2639, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.34375, "rewards/margins": 4.53125, "rewards/rejected": -8.875, "step": 5630 }, { "epoch": 0.3902844093834337, "grad_norm": 33.38971763991421, "learning_rate": 3.823005150594379e-07, "logits/chosen": -3.0625, "logits/rejected": -3.203125, "logps/chosen": -616.0, "logps/rejected": -1056.0, "loss": 0.2841, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.5, "rewards/margins": 4.40625, "rewards/rejected": -8.875, "step": 5640 }, { "epoch": 0.39097640301709224, "grad_norm": 22.79075143117563, "learning_rate": 3.8178770580085914e-07, "logits/chosen": -3.03125, "logits/rejected": -3.140625, "logps/chosen": -616.0, "logps/rejected": -1064.0, "loss": 0.3016, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.40625, "rewards/margins": 4.5, "rewards/rejected": -8.9375, "step": 5650 }, { "epoch": 0.3916683966507508, "grad_norm": 26.791312251120527, "learning_rate": 3.8127412749379905e-07, "logits/chosen": -2.96875, "logits/rejected": -3.125, "logps/chosen": -600.0, "logps/rejected": -1024.0, "loss": 0.2946, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.3125, "rewards/margins": 4.21875, "rewards/rejected": -8.5, "step": 5660 }, { "epoch": 0.3923603902844094, "grad_norm": 16.971840135220162, "learning_rate": 3.807597831352491e-07, "logits/chosen": -2.875, "logits/rejected": -2.953125, "logps/chosen": -644.0, "logps/rejected": -1064.0, "loss": 0.3125, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.65625, "rewards/margins": 4.1875, "rewards/rejected": -8.8125, "step": 5670 }, { "epoch": 0.39305238391806796, "grad_norm": 22.905774111804593, "learning_rate": 3.8024467572667093e-07, "logits/chosen": -3.015625, "logits/rejected": -3.109375, "logps/chosen": -632.0, "logps/rejected": -1064.0, "loss": 0.2882, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.71875, "rewards/margins": 4.3125, "rewards/rejected": -9.0, "step": 5680 }, { "epoch": 0.39374437755172653, "grad_norm": 32.677819651699174, "learning_rate": 3.797288082739789e-07, "logits/chosen": -2.875, "logits/rejected": -3.046875, "logps/chosen": -628.0, "logps/rejected": -1056.0, "loss": 0.2993, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.53125, "rewards/margins": 4.46875, "rewards/rejected": -9.0, "step": 5690 }, { "epoch": 0.3944363711853851, "grad_norm": 29.10044027737681, "learning_rate": 3.7921218378752284e-07, "logits/chosen": -2.96875, "logits/rejected": -3.109375, "logps/chosen": -620.0, "logps/rejected": -1080.0, "loss": 0.2701, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.625, "rewards/margins": 4.53125, "rewards/rejected": -9.125, "step": 5700 }, { "epoch": 0.3951283648190437, "grad_norm": 23.87926886134571, "learning_rate": 3.786948052820701e-07, "logits/chosen": -3.046875, "logits/rejected": -3.265625, "logps/chosen": -640.0, "logps/rejected": -1080.0, "loss": 0.3013, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.71875, "rewards/margins": 4.40625, "rewards/rejected": -9.125, "step": 5710 }, { "epoch": 0.39582035845270225, "grad_norm": 15.932264576872452, "learning_rate": 3.78176675776788e-07, "logits/chosen": -2.9375, "logits/rejected": -3.09375, "logps/chosen": -624.0, "logps/rejected": -1072.0, "loss": 0.2743, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.46875, "rewards/margins": 4.5, "rewards/rejected": -8.9375, "step": 5720 }, { "epoch": 0.3965123520863608, "grad_norm": 25.75674816384029, "learning_rate": 3.776577982952267e-07, "logits/chosen": -2.96875, "logits/rejected": -3.21875, "logps/chosen": -624.0, "logps/rejected": -1096.0, "loss": 0.2934, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.53125, "rewards/margins": 4.8125, "rewards/rejected": -9.375, "step": 5730 }, { "epoch": 0.3972043457200194, "grad_norm": 21.35662323168912, "learning_rate": 3.7713817586530085e-07, "logits/chosen": -2.84375, "logits/rejected": -2.828125, "logps/chosen": -640.0, "logps/rejected": -1128.0, "loss": 0.316, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.71875, "rewards/margins": 4.8125, "rewards/rejected": -9.5625, "step": 5740 }, { "epoch": 0.39789633935367796, "grad_norm": 21.355770871296347, "learning_rate": 3.7661781151927243e-07, "logits/chosen": -3.0, "logits/rejected": -3.265625, "logps/chosen": -636.0, "logps/rejected": -1064.0, "loss": 0.2469, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.75, "rewards/margins": 4.3125, "rewards/rejected": -9.0625, "step": 5750 }, { "epoch": 0.39858833298733654, "grad_norm": 30.083425067700325, "learning_rate": 3.760967082937326e-07, "logits/chosen": -2.953125, "logits/rejected": -3.09375, "logps/chosen": -584.0, "logps/rejected": -1072.0, "loss": 0.2986, "rewards/accuracies": 0.9375, "rewards/chosen": -4.25, "rewards/margins": 4.84375, "rewards/rejected": -9.0625, "step": 5760 }, { "epoch": 0.3992803266209951, "grad_norm": 19.173740724242123, "learning_rate": 3.755748692295847e-07, "logits/chosen": -2.875, "logits/rejected": -3.078125, "logps/chosen": -644.0, "logps/rejected": -1040.0, "loss": 0.3251, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.6875, "rewards/margins": 4.125, "rewards/rejected": -8.8125, "step": 5770 }, { "epoch": 0.3999723202546537, "grad_norm": 15.01696169632013, "learning_rate": 3.750522973720257e-07, "logits/chosen": -2.796875, "logits/rejected": -3.03125, "logps/chosen": -600.0, "logps/rejected": -1048.0, "loss": 0.2708, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.03125, "rewards/margins": 4.59375, "rewards/rejected": -8.625, "step": 5780 }, { "epoch": 0.40066431388831225, "grad_norm": 23.869961307777693, "learning_rate": 3.7452899577052905e-07, "logits/chosen": -2.9375, "logits/rejected": -3.03125, "logps/chosen": -640.0, "logps/rejected": -1048.0, "loss": 0.2764, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.625, "rewards/margins": 4.125, "rewards/rejected": -8.75, "step": 5790 }, { "epoch": 0.4013563075219708, "grad_norm": 12.715778996663404, "learning_rate": 3.7400496747882635e-07, "logits/chosen": -2.953125, "logits/rejected": -2.9375, "logps/chosen": -672.0, "logps/rejected": -1096.0, "loss": 0.2561, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.84375, "rewards/margins": 4.34375, "rewards/rejected": -9.1875, "step": 5800 }, { "epoch": 0.4020483011556294, "grad_norm": 70.04412557485962, "learning_rate": 3.734802155548901e-07, "logits/chosen": -2.84375, "logits/rejected": -3.125, "logps/chosen": -664.0, "logps/rejected": -1064.0, "loss": 0.2976, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.78125, "rewards/margins": 4.34375, "rewards/rejected": -9.125, "step": 5810 }, { "epoch": 0.4027402947892879, "grad_norm": 18.628227546720264, "learning_rate": 3.7295474306091524e-07, "logits/chosen": -2.84375, "logits/rejected": -2.890625, "logps/chosen": -616.0, "logps/rejected": -1040.0, "loss": 0.2815, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.40625, "rewards/margins": 4.28125, "rewards/rejected": -8.6875, "step": 5820 }, { "epoch": 0.4034322884229465, "grad_norm": 30.29548806786079, "learning_rate": 3.72428553063302e-07, "logits/chosen": -2.90625, "logits/rejected": -2.984375, "logps/chosen": -624.0, "logps/rejected": -1064.0, "loss": 0.2884, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.59375, "rewards/margins": 4.34375, "rewards/rejected": -8.9375, "step": 5830 }, { "epoch": 0.40412428205660506, "grad_norm": 18.31379824269021, "learning_rate": 3.719016486326373e-07, "logits/chosen": -3.015625, "logits/rejected": -3.28125, "logps/chosen": -652.0, "logps/rejected": -1088.0, "loss": 0.2276, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.78125, "rewards/margins": 4.375, "rewards/rejected": -9.1875, "step": 5840 }, { "epoch": 0.4048162756902636, "grad_norm": 17.93443919239363, "learning_rate": 3.713740328436772e-07, "logits/chosen": -2.9375, "logits/rejected": -2.96875, "logps/chosen": -656.0, "logps/rejected": -1072.0, "loss": 0.2822, "rewards/accuracies": 0.9375, "rewards/chosen": -4.71875, "rewards/margins": 4.34375, "rewards/rejected": -9.0625, "step": 5850 }, { "epoch": 0.4055082693239222, "grad_norm": 27.36255038740683, "learning_rate": 3.70845708775329e-07, "logits/chosen": -2.875, "logits/rejected": -2.78125, "logps/chosen": -632.0, "logps/rejected": -1088.0, "loss": 0.2941, "rewards/accuracies": 0.9375, "rewards/chosen": -4.65625, "rewards/margins": 4.4375, "rewards/rejected": -9.0625, "step": 5860 }, { "epoch": 0.40620026295758077, "grad_norm": 17.60844922816818, "learning_rate": 3.7031667951063314e-07, "logits/chosen": -3.0, "logits/rejected": -3.109375, "logps/chosen": -624.0, "logps/rejected": -1048.0, "loss": 0.2443, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.65625, "rewards/margins": 4.21875, "rewards/rejected": -8.875, "step": 5870 }, { "epoch": 0.40689225659123934, "grad_norm": 25.26797581426202, "learning_rate": 3.6978694813674513e-07, "logits/chosen": -3.046875, "logits/rejected": -3.21875, "logps/chosen": -680.0, "logps/rejected": -1104.0, "loss": 0.2636, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.8125, "rewards/margins": 4.46875, "rewards/rejected": -9.25, "step": 5880 }, { "epoch": 0.4075842502248979, "grad_norm": 18.99942818007725, "learning_rate": 3.692565177449177e-07, "logits/chosen": -3.015625, "logits/rejected": -3.40625, "logps/chosen": -656.0, "logps/rejected": -1088.0, "loss": 0.2813, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.96875, "rewards/margins": 4.53125, "rewards/rejected": -9.5, "step": 5890 }, { "epoch": 0.4082762438585565, "grad_norm": 27.176758262726338, "learning_rate": 3.687253914304828e-07, "logits/chosen": -2.890625, "logits/rejected": -3.03125, "logps/chosen": -660.0, "logps/rejected": -1040.0, "loss": 0.3036, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.84375, "rewards/margins": 3.921875, "rewards/rejected": -8.75, "step": 5900 }, { "epoch": 0.40896823749221506, "grad_norm": 22.09492538330793, "learning_rate": 3.6819357229283343e-07, "logits/chosen": -2.96875, "logits/rejected": -3.03125, "logps/chosen": -632.0, "logps/rejected": -1064.0, "loss": 0.2622, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.53125, "rewards/margins": 4.4375, "rewards/rejected": -9.0, "step": 5910 }, { "epoch": 0.40966023112587363, "grad_norm": 19.28971955818601, "learning_rate": 3.6766106343540536e-07, "logits/chosen": -2.90625, "logits/rejected": -2.96875, "logps/chosen": -628.0, "logps/rejected": -1096.0, "loss": 0.2566, "rewards/accuracies": 0.9375, "rewards/chosen": -4.5625, "rewards/margins": 4.65625, "rewards/rejected": -9.1875, "step": 5920 }, { "epoch": 0.4103522247595322, "grad_norm": 15.774943861767591, "learning_rate": 3.671278679656595e-07, "logits/chosen": -3.03125, "logits/rejected": -3.078125, "logps/chosen": -648.0, "logps/rejected": -1088.0, "loss": 0.315, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.65625, "rewards/margins": 4.46875, "rewards/rejected": -9.125, "step": 5930 }, { "epoch": 0.4110442183931908, "grad_norm": 13.8825457477747, "learning_rate": 3.665939889950634e-07, "logits/chosen": -2.984375, "logits/rejected": -3.21875, "logps/chosen": -652.0, "logps/rejected": -1048.0, "loss": 0.2816, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.71875, "rewards/margins": 4.21875, "rewards/rejected": -8.9375, "step": 5940 }, { "epoch": 0.41173621202684935, "grad_norm": 29.455917553841147, "learning_rate": 3.6605942963907294e-07, "logits/chosen": -2.9375, "logits/rejected": -3.09375, "logps/chosen": -648.0, "logps/rejected": -1064.0, "loss": 0.2786, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.6875, "rewards/margins": 4.3125, "rewards/rejected": -9.0, "step": 5950 }, { "epoch": 0.4124282056605079, "grad_norm": 32.08592513212122, "learning_rate": 3.6552419301711485e-07, "logits/chosen": -2.953125, "logits/rejected": -3.203125, "logps/chosen": -636.0, "logps/rejected": -1072.0, "loss": 0.2768, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.5625, "rewards/margins": 4.59375, "rewards/rejected": -9.125, "step": 5960 }, { "epoch": 0.4131201992941665, "grad_norm": 27.22349625927251, "learning_rate": 3.649882822525677e-07, "logits/chosen": -2.953125, "logits/rejected": -3.25, "logps/chosen": -652.0, "logps/rejected": -1104.0, "loss": 0.2641, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.6875, "rewards/margins": 4.625, "rewards/rejected": -9.3125, "step": 5970 }, { "epoch": 0.41381219292782506, "grad_norm": 24.55729294724555, "learning_rate": 3.6445170047274416e-07, "logits/chosen": -3.09375, "logits/rejected": -3.078125, "logps/chosen": -676.0, "logps/rejected": -1096.0, "loss": 0.2513, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.0, "rewards/margins": 4.21875, "rewards/rejected": -9.1875, "step": 5980 }, { "epoch": 0.41450418656148363, "grad_norm": 21.3090383123024, "learning_rate": 3.639144508088726e-07, "logits/chosen": -3.09375, "logits/rejected": -3.3125, "logps/chosen": -628.0, "logps/rejected": -1112.0, "loss": 0.2912, "rewards/accuracies": 0.9375, "rewards/chosen": -4.53125, "rewards/margins": 4.875, "rewards/rejected": -9.4375, "step": 5990 }, { "epoch": 0.4151961801951422, "grad_norm": 21.427501760431287, "learning_rate": 3.6337653639607885e-07, "logits/chosen": -3.03125, "logits/rejected": -3.359375, "logps/chosen": -636.0, "logps/rejected": -1048.0, "loss": 0.2966, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.78125, "rewards/margins": 4.34375, "rewards/rejected": -9.125, "step": 6000 }, { "epoch": 0.4151961801951422, "eval_logits/chosen": -2.921875, "eval_logits/rejected": -3.109375, "eval_logps/chosen": -672.0, "eval_logps/rejected": -1048.0, "eval_loss": 0.2331225872039795, "eval_rewards/accuracies": 0.8928220868110657, "eval_rewards/chosen": -4.84375, "eval_rewards/margins": 4.0, "eval_rewards/rejected": -8.8125, "eval_runtime": 2934.0606, "eval_samples_per_second": 33.344, "eval_steps_per_second": 0.521, "step": 6000 }, { "epoch": 0.4158881738288008, "grad_norm": 22.583808559276456, "learning_rate": 3.628379603733678e-07, "logits/chosen": -3.0, "logits/rejected": -3.203125, "logps/chosen": -600.0, "logps/rejected": -1144.0, "loss": 0.2751, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.3125, "rewards/margins": 5.46875, "rewards/rejected": -9.8125, "step": 6010 }, { "epoch": 0.41658016746245935, "grad_norm": 21.604940073831386, "learning_rate": 3.622987258836054e-07, "logits/chosen": -2.859375, "logits/rejected": -3.0, "logps/chosen": -640.0, "logps/rejected": -1112.0, "loss": 0.2765, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.65625, "rewards/margins": 4.625, "rewards/rejected": -9.3125, "step": 6020 }, { "epoch": 0.4172721610961179, "grad_norm": 24.84844811286084, "learning_rate": 3.6175883607349974e-07, "logits/chosen": -2.796875, "logits/rejected": -2.859375, "logps/chosen": -648.0, "logps/rejected": -1104.0, "loss": 0.2791, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.78125, "rewards/margins": 4.4375, "rewards/rejected": -9.1875, "step": 6030 }, { "epoch": 0.4179641547297765, "grad_norm": 15.930116389920894, "learning_rate": 3.6121829409358333e-07, "logits/chosen": -2.875, "logits/rejected": -2.90625, "logps/chosen": -640.0, "logps/rejected": -1072.0, "loss": 0.2521, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.65625, "rewards/margins": 4.40625, "rewards/rejected": -9.0625, "step": 6040 }, { "epoch": 0.41865614836343507, "grad_norm": 14.61124492618345, "learning_rate": 3.606771030981943e-07, "logits/chosen": -2.859375, "logits/rejected": -2.9375, "logps/chosen": -632.0, "logps/rejected": -1072.0, "loss": 0.2879, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.59375, "rewards/margins": 4.59375, "rewards/rejected": -9.1875, "step": 6050 }, { "epoch": 0.41934814199709364, "grad_norm": 28.993574157991397, "learning_rate": 3.601352662454582e-07, "logits/chosen": -2.890625, "logits/rejected": -3.125, "logps/chosen": -664.0, "logps/rejected": -1088.0, "loss": 0.3205, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.8125, "rewards/margins": 4.40625, "rewards/rejected": -9.25, "step": 6060 }, { "epoch": 0.4200401356307522, "grad_norm": 21.65717305112721, "learning_rate": 3.595927866972693e-07, "logits/chosen": -2.828125, "logits/rejected": -3.046875, "logps/chosen": -632.0, "logps/rejected": -1048.0, "loss": 0.2528, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.5625, "rewards/margins": 4.3125, "rewards/rejected": -8.875, "step": 6070 }, { "epoch": 0.4207321292644108, "grad_norm": 29.47214544313303, "learning_rate": 3.5904966761927266e-07, "logits/chosen": -2.890625, "logits/rejected": -3.046875, "logps/chosen": -608.0, "logps/rejected": -1040.0, "loss": 0.2695, "rewards/accuracies": 0.9375, "rewards/chosen": -4.375, "rewards/margins": 4.375, "rewards/rejected": -8.75, "step": 6080 }, { "epoch": 0.42142412289806935, "grad_norm": 21.5630226541659, "learning_rate": 3.585059121808449e-07, "logits/chosen": -2.9375, "logits/rejected": -3.125, "logps/chosen": -624.0, "logps/rejected": -1040.0, "loss": 0.2932, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.53125, "rewards/margins": 4.375, "rewards/rejected": -8.875, "step": 6090 }, { "epoch": 0.4221161165317279, "grad_norm": 26.35989055581321, "learning_rate": 3.579615235550765e-07, "logits/chosen": -2.875, "logits/rejected": -2.890625, "logps/chosen": -580.0, "logps/rejected": -1020.0, "loss": 0.2817, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.125, "rewards/margins": 4.59375, "rewards/rejected": -8.75, "step": 6100 }, { "epoch": 0.4228081101653865, "grad_norm": 19.07980574390856, "learning_rate": 3.5741650491875266e-07, "logits/chosen": -2.96875, "logits/rejected": -3.140625, "logps/chosen": -616.0, "logps/rejected": -1040.0, "loss": 0.2614, "rewards/accuracies": 0.90625, "rewards/chosen": -4.5625, "rewards/margins": 4.15625, "rewards/rejected": -8.75, "step": 6110 }, { "epoch": 0.42350010379904507, "grad_norm": 20.6245222711257, "learning_rate": 3.568708594523351e-07, "logits/chosen": -2.765625, "logits/rejected": -3.234375, "logps/chosen": -628.0, "logps/rejected": -1072.0, "loss": 0.2629, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.5, "rewards/margins": 4.625, "rewards/rejected": -9.125, "step": 6120 }, { "epoch": 0.42419209743270364, "grad_norm": 19.0115987439649, "learning_rate": 3.563245903399435e-07, "logits/chosen": -3.03125, "logits/rejected": -3.1875, "logps/chosen": -652.0, "logps/rejected": -1128.0, "loss": 0.2879, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.78125, "rewards/margins": 5.0, "rewards/rejected": -9.75, "step": 6130 }, { "epoch": 0.4248840910663622, "grad_norm": 19.66113389542264, "learning_rate": 3.5577770076933673e-07, "logits/chosen": -2.953125, "logits/rejected": -3.0, "logps/chosen": -692.0, "logps/rejected": -1184.0, "loss": 0.3054, "rewards/accuracies": 0.9375, "rewards/chosen": -4.96875, "rewards/margins": 5.03125, "rewards/rejected": -10.0, "step": 6140 }, { "epoch": 0.4255760847000208, "grad_norm": 28.598754982491833, "learning_rate": 3.5523019393189437e-07, "logits/chosen": -2.859375, "logits/rejected": -3.046875, "logps/chosen": -624.0, "logps/rejected": -1072.0, "loss": 0.2689, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.5, "rewards/margins": 4.71875, "rewards/rejected": -9.1875, "step": 6150 }, { "epoch": 0.42626807833367936, "grad_norm": 17.16461292685636, "learning_rate": 3.54682073022598e-07, "logits/chosen": -3.0, "logits/rejected": -3.3125, "logps/chosen": -620.0, "logps/rejected": -1080.0, "loss": 0.2352, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.59375, "rewards/margins": 4.6875, "rewards/rejected": -9.25, "step": 6160 }, { "epoch": 0.4269600719673379, "grad_norm": 27.398988011871374, "learning_rate": 3.541333412400128e-07, "logits/chosen": -2.9375, "logits/rejected": -3.03125, "logps/chosen": -652.0, "logps/rejected": -1136.0, "loss": 0.2672, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.75, "rewards/margins": 4.96875, "rewards/rejected": -9.6875, "step": 6170 }, { "epoch": 0.42765206560099644, "grad_norm": 20.489650013650632, "learning_rate": 3.535840017862685e-07, "logits/chosen": -2.90625, "logits/rejected": -3.125, "logps/chosen": -644.0, "logps/rejected": -1096.0, "loss": 0.2638, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.59375, "rewards/margins": 4.625, "rewards/rejected": -9.1875, "step": 6180 }, { "epoch": 0.428344059234655, "grad_norm": 18.73280476715156, "learning_rate": 3.5303405786704117e-07, "logits/chosen": -2.90625, "logits/rejected": -2.859375, "logps/chosen": -624.0, "logps/rejected": -1080.0, "loss": 0.2765, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.5625, "rewards/margins": 4.375, "rewards/rejected": -8.9375, "step": 6190 }, { "epoch": 0.4290360528683136, "grad_norm": 24.83892445534026, "learning_rate": 3.5248351269153387e-07, "logits/chosen": -2.9375, "logits/rejected": -3.34375, "logps/chosen": -672.0, "logps/rejected": -1064.0, "loss": 0.2832, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.875, "rewards/margins": 4.25, "rewards/rejected": -9.125, "step": 6200 }, { "epoch": 0.42972804650197216, "grad_norm": 22.73924875397022, "learning_rate": 3.519323694724586e-07, "logits/chosen": -3.0, "logits/rejected": -3.09375, "logps/chosen": -684.0, "logps/rejected": -1136.0, "loss": 0.2885, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.9375, "rewards/margins": 4.65625, "rewards/rejected": -9.625, "step": 6210 }, { "epoch": 0.43042004013563073, "grad_norm": 22.374797791926138, "learning_rate": 3.5138063142601715e-07, "logits/chosen": -3.0625, "logits/rejected": -3.25, "logps/chosen": -660.0, "logps/rejected": -1120.0, "loss": 0.2635, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.8125, "rewards/margins": 4.5625, "rewards/rejected": -9.375, "step": 6220 }, { "epoch": 0.4311120337692893, "grad_norm": 23.288480699802825, "learning_rate": 3.508283017718824e-07, "logits/chosen": -2.859375, "logits/rejected": -3.09375, "logps/chosen": -584.0, "logps/rejected": -1024.0, "loss": 0.275, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.15625, "rewards/margins": 4.5, "rewards/rejected": -8.6875, "step": 6230 }, { "epoch": 0.4318040274029479, "grad_norm": 25.795354973884297, "learning_rate": 3.5027538373317967e-07, "logits/chosen": -2.765625, "logits/rejected": -2.984375, "logps/chosen": -608.0, "logps/rejected": -1072.0, "loss": 0.2567, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.34375, "rewards/margins": 4.96875, "rewards/rejected": -9.3125, "step": 6240 }, { "epoch": 0.43249602103660645, "grad_norm": 30.48237067345401, "learning_rate": 3.4972188053646766e-07, "logits/chosen": -2.859375, "logits/rejected": -3.140625, "logps/chosen": -692.0, "logps/rejected": -1128.0, "loss": 0.3052, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.96875, "rewards/margins": 4.625, "rewards/rejected": -9.625, "step": 6250 }, { "epoch": 0.433188014670265, "grad_norm": 29.226529582916385, "learning_rate": 3.4916779541171994e-07, "logits/chosen": -2.96875, "logits/rejected": -3.21875, "logps/chosen": -696.0, "logps/rejected": -1136.0, "loss": 0.2754, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.09375, "rewards/margins": 4.5, "rewards/rejected": -9.625, "step": 6260 }, { "epoch": 0.4338800083039236, "grad_norm": 19.03718702584664, "learning_rate": 3.4861313159230565e-07, "logits/chosen": -2.921875, "logits/rejected": -3.125, "logps/chosen": -656.0, "logps/rejected": -1080.0, "loss": 0.292, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.625, "rewards/margins": 4.40625, "rewards/rejected": -9.0625, "step": 6270 }, { "epoch": 0.43457200193758216, "grad_norm": 15.914681065855484, "learning_rate": 3.4805789231497125e-07, "logits/chosen": -3.03125, "logits/rejected": -3.140625, "logps/chosen": -604.0, "logps/rejected": -1056.0, "loss": 0.2661, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.53125, "rewards/margins": 4.5625, "rewards/rejected": -9.0625, "step": 6280 }, { "epoch": 0.43526399557124074, "grad_norm": 16.555755755538804, "learning_rate": 3.4750208081982094e-07, "logits/chosen": -3.0, "logits/rejected": -3.171875, "logps/chosen": -648.0, "logps/rejected": -1096.0, "loss": 0.2839, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.75, "rewards/margins": 4.5625, "rewards/rejected": -9.3125, "step": 6290 }, { "epoch": 0.4359559892048993, "grad_norm": 17.446164036005325, "learning_rate": 3.4694570035029845e-07, "logits/chosen": -2.828125, "logits/rejected": -2.921875, "logps/chosen": -672.0, "logps/rejected": -1088.0, "loss": 0.2928, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.90625, "rewards/margins": 4.34375, "rewards/rejected": -9.25, "step": 6300 }, { "epoch": 0.4366479828385579, "grad_norm": 17.610981777251645, "learning_rate": 3.463887541531676e-07, "logits/chosen": -2.859375, "logits/rejected": -2.84375, "logps/chosen": -632.0, "logps/rejected": -1128.0, "loss": 0.2321, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.59375, "rewards/margins": 4.8125, "rewards/rejected": -9.4375, "step": 6310 }, { "epoch": 0.43733997647221645, "grad_norm": 20.64476155647373, "learning_rate": 3.458312454784934e-07, "logits/chosen": -2.90625, "logits/rejected": -3.109375, "logps/chosen": -628.0, "logps/rejected": -1064.0, "loss": 0.2587, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.5, "rewards/margins": 4.6875, "rewards/rejected": -9.1875, "step": 6320 }, { "epoch": 0.438031970105875, "grad_norm": 16.622473573657363, "learning_rate": 3.452731775796234e-07, "logits/chosen": -2.90625, "logits/rejected": -3.265625, "logps/chosen": -648.0, "logps/rejected": -1104.0, "loss": 0.2366, "rewards/accuracies": 0.96875, "rewards/chosen": -4.6875, "rewards/margins": 4.6875, "rewards/rejected": -9.375, "step": 6330 }, { "epoch": 0.4387239637395336, "grad_norm": 18.80133912081461, "learning_rate": 3.447145537131684e-07, "logits/chosen": -2.9375, "logits/rejected": -3.203125, "logps/chosen": -632.0, "logps/rejected": -1064.0, "loss": 0.2644, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.59375, "rewards/margins": 4.3125, "rewards/rejected": -8.9375, "step": 6340 }, { "epoch": 0.43941595737319217, "grad_norm": 26.744891650136243, "learning_rate": 3.4415537713898365e-07, "logits/chosen": -2.875, "logits/rejected": -3.1875, "logps/chosen": -648.0, "logps/rejected": -1088.0, "loss": 0.2739, "rewards/accuracies": 0.90625, "rewards/chosen": -4.59375, "rewards/margins": 4.46875, "rewards/rejected": -9.0625, "step": 6350 }, { "epoch": 0.44010795100685074, "grad_norm": 16.786503744914647, "learning_rate": 3.4359565112014954e-07, "logits/chosen": -2.890625, "logits/rejected": -3.203125, "logps/chosen": -628.0, "logps/rejected": -1064.0, "loss": 0.2604, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.3125, "rewards/margins": 4.6875, "rewards/rejected": -9.0, "step": 6360 }, { "epoch": 0.4407999446405093, "grad_norm": 25.668744573783595, "learning_rate": 3.43035378922953e-07, "logits/chosen": -3.015625, "logits/rejected": -3.109375, "logps/chosen": -684.0, "logps/rejected": -1056.0, "loss": 0.2614, "rewards/accuracies": 0.90625, "rewards/chosen": -4.90625, "rewards/margins": 4.15625, "rewards/rejected": -9.0625, "step": 6370 }, { "epoch": 0.4414919382741679, "grad_norm": 20.04467227811988, "learning_rate": 3.4247456381686794e-07, "logits/chosen": -2.90625, "logits/rejected": -3.140625, "logps/chosen": -644.0, "logps/rejected": -1080.0, "loss": 0.2877, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.46875, "rewards/margins": 4.625, "rewards/rejected": -9.0625, "step": 6380 }, { "epoch": 0.44218393190782646, "grad_norm": 26.33429128205361, "learning_rate": 3.419132090745367e-07, "logits/chosen": -2.890625, "logits/rejected": -3.125, "logps/chosen": -640.0, "logps/rejected": -1080.0, "loss": 0.2325, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.5625, "rewards/margins": 4.53125, "rewards/rejected": -9.0625, "step": 6390 }, { "epoch": 0.442875925541485, "grad_norm": 22.55962248403195, "learning_rate": 3.413513179717504e-07, "logits/chosen": -2.953125, "logits/rejected": -3.21875, "logps/chosen": -648.0, "logps/rejected": -1088.0, "loss": 0.2263, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.65625, "rewards/margins": 4.5625, "rewards/rejected": -9.1875, "step": 6400 }, { "epoch": 0.4435679191751436, "grad_norm": 28.102361444521527, "learning_rate": 3.407888937874303e-07, "logits/chosen": -3.015625, "logits/rejected": -3.21875, "logps/chosen": -620.0, "logps/rejected": -1128.0, "loss": 0.2873, "rewards/accuracies": 0.96875, "rewards/chosen": -4.625, "rewards/margins": 4.875, "rewards/rejected": -9.5, "step": 6410 }, { "epoch": 0.44425991280880217, "grad_norm": 25.260089693973978, "learning_rate": 3.402259398036083e-07, "logits/chosen": -2.890625, "logits/rejected": -3.046875, "logps/chosen": -648.0, "logps/rejected": -1096.0, "loss": 0.2891, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.78125, "rewards/margins": 4.4375, "rewards/rejected": -9.1875, "step": 6420 }, { "epoch": 0.44495190644246074, "grad_norm": 14.725577766933595, "learning_rate": 3.396624593054081e-07, "logits/chosen": -2.96875, "logits/rejected": -3.09375, "logps/chosen": -644.0, "logps/rejected": -1112.0, "loss": 0.242, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.78125, "rewards/margins": 4.8125, "rewards/rejected": -9.625, "step": 6430 }, { "epoch": 0.4456439000761193, "grad_norm": 26.15400256847697, "learning_rate": 3.390984555810257e-07, "logits/chosen": -2.90625, "logits/rejected": -3.015625, "logps/chosen": -648.0, "logps/rejected": -1096.0, "loss": 0.2928, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.6875, "rewards/margins": 4.53125, "rewards/rejected": -9.1875, "step": 6440 }, { "epoch": 0.4463358937097779, "grad_norm": 17.997009280440714, "learning_rate": 3.385339319217106e-07, "logits/chosen": -2.84375, "logits/rejected": -3.03125, "logps/chosen": -648.0, "logps/rejected": -1104.0, "loss": 0.2582, "rewards/accuracies": 0.96875, "rewards/chosen": -4.5, "rewards/margins": 4.78125, "rewards/rejected": -9.25, "step": 6450 }, { "epoch": 0.44702788734343646, "grad_norm": 23.556818217456076, "learning_rate": 3.379688916217464e-07, "logits/chosen": -2.828125, "logits/rejected": -2.90625, "logps/chosen": -636.0, "logps/rejected": -1104.0, "loss": 0.2303, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.625, "rewards/margins": 4.84375, "rewards/rejected": -9.4375, "step": 6460 }, { "epoch": 0.44771988097709503, "grad_norm": 29.39337947039767, "learning_rate": 3.374033379784312e-07, "logits/chosen": -2.953125, "logits/rejected": -3.171875, "logps/chosen": -616.0, "logps/rejected": -1080.0, "loss": 0.2865, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.46875, "rewards/margins": 4.625, "rewards/rejected": -9.0625, "step": 6470 }, { "epoch": 0.4484118746107536, "grad_norm": 26.147718871265642, "learning_rate": 3.36837274292059e-07, "logits/chosen": -2.921875, "logits/rejected": -3.203125, "logps/chosen": -652.0, "logps/rejected": -1104.0, "loss": 0.2499, "rewards/accuracies": 0.9375, "rewards/chosen": -4.71875, "rewards/margins": 4.6875, "rewards/rejected": -9.4375, "step": 6480 }, { "epoch": 0.4491038682444122, "grad_norm": 21.409585162292128, "learning_rate": 3.362707038659003e-07, "logits/chosen": -3.0625, "logits/rejected": -3.21875, "logps/chosen": -672.0, "logps/rejected": -1112.0, "loss": 0.2862, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.0625, "rewards/margins": 4.46875, "rewards/rejected": -9.5, "step": 6490 }, { "epoch": 0.44979586187807075, "grad_norm": 22.141334677795047, "learning_rate": 3.3570363000618234e-07, "logits/chosen": -2.875, "logits/rejected": -3.15625, "logps/chosen": -688.0, "logps/rejected": -1120.0, "loss": 0.2751, "rewards/accuracies": 0.9375, "rewards/chosen": -5.1875, "rewards/margins": 4.3125, "rewards/rejected": -9.5, "step": 6500 }, { "epoch": 0.4504878555117293, "grad_norm": 28.81571672399864, "learning_rate": 3.351360560220705e-07, "logits/chosen": -2.828125, "logits/rejected": -3.0, "logps/chosen": -680.0, "logps/rejected": -1168.0, "loss": 0.3046, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.8125, "rewards/margins": 5.1875, "rewards/rejected": -10.0, "step": 6510 }, { "epoch": 0.4511798491453879, "grad_norm": 25.052094280298842, "learning_rate": 3.345679852256483e-07, "logits/chosen": -3.0625, "logits/rejected": -3.234375, "logps/chosen": -648.0, "logps/rejected": -1080.0, "loss": 0.2771, "rewards/accuracies": 0.9375, "rewards/chosen": -4.6875, "rewards/margins": 4.4375, "rewards/rejected": -9.125, "step": 6520 }, { "epoch": 0.4518718427790464, "grad_norm": 22.880284132252157, "learning_rate": 3.3399942093189873e-07, "logits/chosen": -2.90625, "logits/rejected": -3.171875, "logps/chosen": -608.0, "logps/rejected": -1072.0, "loss": 0.2617, "rewards/accuracies": 0.96875, "rewards/chosen": -4.46875, "rewards/margins": 4.8125, "rewards/rejected": -9.3125, "step": 6530 }, { "epoch": 0.452563836412705, "grad_norm": 24.54514574066971, "learning_rate": 3.3343036645868427e-07, "logits/chosen": -2.984375, "logits/rejected": -3.3125, "logps/chosen": -640.0, "logps/rejected": -1064.0, "loss": 0.3106, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.65625, "rewards/margins": 4.46875, "rewards/rejected": -9.125, "step": 6540 }, { "epoch": 0.45325583004636355, "grad_norm": 19.29391545795956, "learning_rate": 3.3286082512672815e-07, "logits/chosen": -2.875, "logits/rejected": -3.078125, "logps/chosen": -672.0, "logps/rejected": -1088.0, "loss": 0.2624, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.9375, "rewards/margins": 4.3125, "rewards/rejected": -9.25, "step": 6550 }, { "epoch": 0.4539478236800221, "grad_norm": 18.452119726594734, "learning_rate": 3.3229080025959443e-07, "logits/chosen": -2.890625, "logits/rejected": -3.0625, "logps/chosen": -668.0, "logps/rejected": -1080.0, "loss": 0.2734, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.90625, "rewards/margins": 4.25, "rewards/rejected": -9.125, "step": 6560 }, { "epoch": 0.4546398173136807, "grad_norm": 23.948934945479245, "learning_rate": 3.3172029518366906e-07, "logits/chosen": -2.90625, "logits/rejected": -3.140625, "logps/chosen": -652.0, "logps/rejected": -1128.0, "loss": 0.2486, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.6875, "rewards/margins": 4.875, "rewards/rejected": -9.5625, "step": 6570 }, { "epoch": 0.45533181094733927, "grad_norm": 26.334263495889186, "learning_rate": 3.3114931322814017e-07, "logits/chosen": -2.875, "logits/rejected": -3.0625, "logps/chosen": -656.0, "logps/rejected": -1168.0, "loss": 0.2927, "rewards/accuracies": 0.9375, "rewards/chosen": -4.6875, "rewards/margins": 5.21875, "rewards/rejected": -9.875, "step": 6580 }, { "epoch": 0.45602380458099784, "grad_norm": 20.902736486355455, "learning_rate": 3.3057785772497846e-07, "logits/chosen": -2.890625, "logits/rejected": -3.03125, "logps/chosen": -660.0, "logps/rejected": -1112.0, "loss": 0.2675, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.8125, "rewards/margins": 4.53125, "rewards/rejected": -9.375, "step": 6590 }, { "epoch": 0.4567157982146564, "grad_norm": 20.16307810411145, "learning_rate": 3.3000593200891856e-07, "logits/chosen": -2.796875, "logits/rejected": -2.796875, "logps/chosen": -660.0, "logps/rejected": -1136.0, "loss": 0.2679, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.6875, "rewards/margins": 4.9375, "rewards/rejected": -9.625, "step": 6600 }, { "epoch": 0.457407791848315, "grad_norm": 21.28960141203375, "learning_rate": 3.2943353941743857e-07, "logits/chosen": -3.078125, "logits/rejected": -2.984375, "logps/chosen": -628.0, "logps/rejected": -1072.0, "loss": 0.283, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.59375, "rewards/margins": 4.46875, "rewards/rejected": -9.0625, "step": 6610 }, { "epoch": 0.45809978548197355, "grad_norm": 25.656714839274066, "learning_rate": 3.288606832907412e-07, "logits/chosen": -2.890625, "logits/rejected": -3.078125, "logps/chosen": -628.0, "logps/rejected": -1040.0, "loss": 0.2482, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.5625, "rewards/margins": 4.25, "rewards/rejected": -8.8125, "step": 6620 }, { "epoch": 0.4587917791156321, "grad_norm": 24.603570768122285, "learning_rate": 3.2828736697173415e-07, "logits/chosen": -2.953125, "logits/rejected": -3.171875, "logps/chosen": -620.0, "logps/rejected": -1104.0, "loss": 0.2737, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.40625, "rewards/margins": 4.90625, "rewards/rejected": -9.3125, "step": 6630 }, { "epoch": 0.4594837727492907, "grad_norm": 29.545969602889095, "learning_rate": 3.277135938060106e-07, "logits/chosen": -2.875, "logits/rejected": -3.1875, "logps/chosen": -612.0, "logps/rejected": -1104.0, "loss": 0.2282, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.40625, "rewards/margins": 5.0625, "rewards/rejected": -9.5, "step": 6640 }, { "epoch": 0.46017576638294927, "grad_norm": 24.092512561677616, "learning_rate": 3.2713936714182956e-07, "logits/chosen": -2.828125, "logits/rejected": -2.984375, "logps/chosen": -624.0, "logps/rejected": -1152.0, "loss": 0.2807, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.5625, "rewards/margins": 5.28125, "rewards/rejected": -9.8125, "step": 6650 }, { "epoch": 0.46086776001660784, "grad_norm": 21.818997071425976, "learning_rate": 3.265646903300966e-07, "logits/chosen": -2.78125, "logits/rejected": -3.15625, "logps/chosen": -600.0, "logps/rejected": -1088.0, "loss": 0.3082, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.3125, "rewards/margins": 4.96875, "rewards/rejected": -9.3125, "step": 6660 }, { "epoch": 0.4615597536502664, "grad_norm": 27.37613869337617, "learning_rate": 3.259895667243439e-07, "logits/chosen": -2.703125, "logits/rejected": -2.78125, "logps/chosen": -608.0, "logps/rejected": -1088.0, "loss": 0.2455, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.3125, "rewards/margins": 4.53125, "rewards/rejected": -8.875, "step": 6670 }, { "epoch": 0.462251747283925, "grad_norm": 32.6628357267371, "learning_rate": 3.2541399968071115e-07, "logits/chosen": -2.78125, "logits/rejected": -2.875, "logps/chosen": -640.0, "logps/rejected": -1048.0, "loss": 0.288, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.65625, "rewards/margins": 4.15625, "rewards/rejected": -8.8125, "step": 6680 }, { "epoch": 0.46294374091758356, "grad_norm": 23.803773228459434, "learning_rate": 3.248379925579255e-07, "logits/chosen": -2.78125, "logits/rejected": -2.8125, "logps/chosen": -632.0, "logps/rejected": -1104.0, "loss": 0.2702, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.46875, "rewards/margins": 4.8125, "rewards/rejected": -9.3125, "step": 6690 }, { "epoch": 0.46363573455124213, "grad_norm": 17.595124295468864, "learning_rate": 3.242615487172824e-07, "logits/chosen": -2.609375, "logits/rejected": -2.71875, "logps/chosen": -624.0, "logps/rejected": -1056.0, "loss": 0.2768, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.4375, "rewards/margins": 4.53125, "rewards/rejected": -9.0, "step": 6700 }, { "epoch": 0.4643277281849007, "grad_norm": 19.203748920882735, "learning_rate": 3.236846715226257e-07, "logits/chosen": -2.5625, "logits/rejected": -2.828125, "logps/chosen": -640.0, "logps/rejected": -1064.0, "loss": 0.2594, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.46875, "rewards/margins": 4.375, "rewards/rejected": -8.8125, "step": 6710 }, { "epoch": 0.4650197218185593, "grad_norm": 24.423628762777444, "learning_rate": 3.2310736434032794e-07, "logits/chosen": -2.78125, "logits/rejected": -2.921875, "logps/chosen": -648.0, "logps/rejected": -1080.0, "loss": 0.2374, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.6875, "rewards/margins": 4.53125, "rewards/rejected": -9.25, "step": 6720 }, { "epoch": 0.46571171545221784, "grad_norm": 22.286066881007546, "learning_rate": 3.225296305392712e-07, "logits/chosen": -2.84375, "logits/rejected": -3.0, "logps/chosen": -648.0, "logps/rejected": -1080.0, "loss": 0.2986, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.8125, "rewards/margins": 4.3125, "rewards/rejected": -9.125, "step": 6730 }, { "epoch": 0.4664037090858764, "grad_norm": 22.839938191137527, "learning_rate": 3.219514734908265e-07, "logits/chosen": -2.875, "logits/rejected": -3.21875, "logps/chosen": -648.0, "logps/rejected": -1072.0, "loss": 0.3063, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.75, "rewards/margins": 4.46875, "rewards/rejected": -9.1875, "step": 6740 }, { "epoch": 0.467095702719535, "grad_norm": 19.20443047192453, "learning_rate": 3.2137289656883556e-07, "logits/chosen": -2.859375, "logits/rejected": -2.96875, "logps/chosen": -652.0, "logps/rejected": -1080.0, "loss": 0.2637, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.71875, "rewards/margins": 4.375, "rewards/rejected": -9.125, "step": 6750 }, { "epoch": 0.46778769635319356, "grad_norm": 17.49418890657944, "learning_rate": 3.2079390314958943e-07, "logits/chosen": -2.78125, "logits/rejected": -2.9375, "logps/chosen": -600.0, "logps/rejected": -1032.0, "loss": 0.3059, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.28125, "rewards/margins": 4.5, "rewards/rejected": -8.75, "step": 6760 }, { "epoch": 0.46847968998685213, "grad_norm": 17.076421519736265, "learning_rate": 3.2021449661181035e-07, "logits/chosen": -2.796875, "logits/rejected": -2.90625, "logps/chosen": -636.0, "logps/rejected": -1104.0, "loss": 0.2437, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.625, "rewards/margins": 4.75, "rewards/rejected": -9.375, "step": 6770 }, { "epoch": 0.4691716836205107, "grad_norm": 17.335281286429925, "learning_rate": 3.1963468033663063e-07, "logits/chosen": -2.9375, "logits/rejected": -3.109375, "logps/chosen": -620.0, "logps/rejected": -1048.0, "loss": 0.2401, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.5, "rewards/margins": 4.3125, "rewards/rejected": -8.8125, "step": 6780 }, { "epoch": 0.4698636772541693, "grad_norm": 16.6640831993246, "learning_rate": 3.1905445770757443e-07, "logits/chosen": -2.765625, "logits/rejected": -2.734375, "logps/chosen": -620.0, "logps/rejected": -1080.0, "loss": 0.2757, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.375, "rewards/margins": 4.53125, "rewards/rejected": -8.9375, "step": 6790 }, { "epoch": 0.47055567088782785, "grad_norm": 21.59747609476279, "learning_rate": 3.1847383211053626e-07, "logits/chosen": -2.875, "logits/rejected": -3.125, "logps/chosen": -640.0, "logps/rejected": -1096.0, "loss": 0.254, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.65625, "rewards/margins": 4.65625, "rewards/rejected": -9.3125, "step": 6800 }, { "epoch": 0.4712476645214864, "grad_norm": 15.632650014937916, "learning_rate": 3.17892806933763e-07, "logits/chosen": -2.96875, "logits/rejected": -3.109375, "logps/chosen": -612.0, "logps/rejected": -1072.0, "loss": 0.2785, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.46875, "rewards/margins": 4.71875, "rewards/rejected": -9.1875, "step": 6810 }, { "epoch": 0.471939658155145, "grad_norm": 18.75243042101128, "learning_rate": 3.1731138556783265e-07, "logits/chosen": -2.90625, "logits/rejected": -3.0, "logps/chosen": -628.0, "logps/rejected": -1120.0, "loss": 0.2466, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.625, "rewards/margins": 4.96875, "rewards/rejected": -9.5625, "step": 6820 }, { "epoch": 0.47263165178880356, "grad_norm": 25.29754454211128, "learning_rate": 3.167295714056356e-07, "logits/chosen": -2.90625, "logits/rejected": -3.09375, "logps/chosen": -664.0, "logps/rejected": -1120.0, "loss": 0.2698, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.9375, "rewards/margins": 4.59375, "rewards/rejected": -9.5, "step": 6830 }, { "epoch": 0.47332364542246214, "grad_norm": 21.681069195866865, "learning_rate": 3.161473678423541e-07, "logits/chosen": -2.828125, "logits/rejected": -3.171875, "logps/chosen": -636.0, "logps/rejected": -1120.0, "loss": 0.2738, "rewards/accuracies": 0.9375, "rewards/chosen": -4.6875, "rewards/margins": 4.9375, "rewards/rejected": -9.625, "step": 6840 }, { "epoch": 0.4740156390561207, "grad_norm": 16.232719682741962, "learning_rate": 3.15564778275443e-07, "logits/chosen": -2.78125, "logits/rejected": -3.15625, "logps/chosen": -632.0, "logps/rejected": -1048.0, "loss": 0.2162, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.4375, "rewards/margins": 4.46875, "rewards/rejected": -8.9375, "step": 6850 }, { "epoch": 0.4747076326897793, "grad_norm": 15.668735035897713, "learning_rate": 3.1498180610460954e-07, "logits/chosen": -2.9375, "logits/rejected": -3.234375, "logps/chosen": -616.0, "logps/rejected": -1096.0, "loss": 0.1997, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.40625, "rewards/margins": 4.96875, "rewards/rejected": -9.375, "step": 6860 }, { "epoch": 0.47539962632343785, "grad_norm": 27.806608196153412, "learning_rate": 3.143984547317937e-07, "logits/chosen": -2.828125, "logits/rejected": -3.09375, "logps/chosen": -636.0, "logps/rejected": -1128.0, "loss": 0.2656, "rewards/accuracies": 0.96875, "rewards/chosen": -4.59375, "rewards/margins": 5.03125, "rewards/rejected": -9.625, "step": 6870 }, { "epoch": 0.47609161995709637, "grad_norm": 26.47829827860009, "learning_rate": 3.1381472756114823e-07, "logits/chosen": -2.796875, "logits/rejected": -2.9375, "logps/chosen": -612.0, "logps/rejected": -1072.0, "loss": 0.2919, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.375, "rewards/margins": 4.65625, "rewards/rejected": -9.0625, "step": 6880 }, { "epoch": 0.47678361359075494, "grad_norm": 13.473854598888687, "learning_rate": 3.132306279990189e-07, "logits/chosen": -2.875, "logits/rejected": -2.96875, "logps/chosen": -644.0, "logps/rejected": -1080.0, "loss": 0.2637, "rewards/accuracies": 0.96875, "rewards/chosen": -4.59375, "rewards/margins": 4.5, "rewards/rejected": -9.125, "step": 6890 }, { "epoch": 0.4774756072244135, "grad_norm": 22.043614634707, "learning_rate": 3.126461594539247e-07, "logits/chosen": -2.765625, "logits/rejected": -3.15625, "logps/chosen": -648.0, "logps/rejected": -1064.0, "loss": 0.2646, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.71875, "rewards/margins": 4.4375, "rewards/rejected": -9.1875, "step": 6900 }, { "epoch": 0.4781676008580721, "grad_norm": 18.95129783680489, "learning_rate": 3.1206132533653746e-07, "logits/chosen": -2.828125, "logits/rejected": -3.0, "logps/chosen": -644.0, "logps/rejected": -1064.0, "loss": 0.2398, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.71875, "rewards/margins": 4.34375, "rewards/rejected": -9.0625, "step": 6910 }, { "epoch": 0.47885959449173066, "grad_norm": 35.867900670829094, "learning_rate": 3.114761290596628e-07, "logits/chosen": -2.859375, "logits/rejected": -3.1875, "logps/chosen": -632.0, "logps/rejected": -1096.0, "loss": 0.2702, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.46875, "rewards/margins": 4.8125, "rewards/rejected": -9.3125, "step": 6920 }, { "epoch": 0.4795515881253892, "grad_norm": 26.907108940509207, "learning_rate": 3.1089057403821926e-07, "logits/chosen": -2.875, "logits/rejected": -3.078125, "logps/chosen": -652.0, "logps/rejected": -1112.0, "loss": 0.2416, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.6875, "rewards/margins": 4.59375, "rewards/rejected": -9.3125, "step": 6930 }, { "epoch": 0.4802435817590478, "grad_norm": 15.636828408990194, "learning_rate": 3.1030466368921916e-07, "logits/chosen": -2.921875, "logits/rejected": -3.140625, "logps/chosen": -652.0, "logps/rejected": -1112.0, "loss": 0.2296, "rewards/accuracies": 0.9375, "rewards/chosen": -4.875, "rewards/margins": 4.6875, "rewards/rejected": -9.5625, "step": 6940 }, { "epoch": 0.48093557539270637, "grad_norm": 25.675809563206634, "learning_rate": 3.097184014317482e-07, "logits/chosen": -2.875, "logits/rejected": -3.234375, "logps/chosen": -652.0, "logps/rejected": -1144.0, "loss": 0.2369, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.75, "rewards/margins": 5.09375, "rewards/rejected": -9.875, "step": 6950 }, { "epoch": 0.48162756902636494, "grad_norm": 17.320862464731825, "learning_rate": 3.091317906869458e-07, "logits/chosen": -2.734375, "logits/rejected": -3.078125, "logps/chosen": -624.0, "logps/rejected": -1080.0, "loss": 0.2777, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.46875, "rewards/margins": 4.84375, "rewards/rejected": -9.3125, "step": 6960 }, { "epoch": 0.4823195626600235, "grad_norm": 23.825326482977545, "learning_rate": 3.085448348779846e-07, "logits/chosen": -2.96875, "logits/rejected": -3.03125, "logps/chosen": -632.0, "logps/rejected": -1096.0, "loss": 0.2816, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.5625, "rewards/margins": 4.8125, "rewards/rejected": -9.375, "step": 6970 }, { "epoch": 0.4830115562936821, "grad_norm": 17.953536428577618, "learning_rate": 3.0795753743005136e-07, "logits/chosen": -2.921875, "logits/rejected": -3.25, "logps/chosen": -652.0, "logps/rejected": -1104.0, "loss": 0.2225, "rewards/accuracies": 0.96875, "rewards/chosen": -4.625, "rewards/margins": 4.9375, "rewards/rejected": -9.5625, "step": 6980 }, { "epoch": 0.48370354992734066, "grad_norm": 17.506830429443905, "learning_rate": 3.073699017703261e-07, "logits/chosen": -2.953125, "logits/rejected": -3.078125, "logps/chosen": -684.0, "logps/rejected": -1096.0, "loss": 0.2679, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.96875, "rewards/margins": 4.34375, "rewards/rejected": -9.3125, "step": 6990 }, { "epoch": 0.48439554356099923, "grad_norm": 23.970019660403597, "learning_rate": 3.0678193132796265e-07, "logits/chosen": -2.890625, "logits/rejected": -3.15625, "logps/chosen": -664.0, "logps/rejected": -1032.0, "loss": 0.2946, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.875, "rewards/margins": 3.921875, "rewards/rejected": -8.8125, "step": 7000 }, { "epoch": 0.48439554356099923, "eval_logits/chosen": -2.84375, "eval_logits/rejected": -3.046875, "eval_logps/chosen": -676.0, "eval_logps/rejected": -1056.0, "eval_loss": 0.23060083389282227, "eval_rewards/accuracies": 0.8963374495506287, "eval_rewards/chosen": -4.875, "eval_rewards/margins": 4.03125, "eval_rewards/rejected": -8.9375, "eval_runtime": 2938.268, "eval_samples_per_second": 33.296, "eval_steps_per_second": 0.52, "step": 7000 }, { "epoch": 0.4850875371946578, "grad_norm": 17.166766216881282, "learning_rate": 3.0619362953406846e-07, "logits/chosen": -2.890625, "logits/rejected": -3.21875, "logps/chosen": -632.0, "logps/rejected": -1104.0, "loss": 0.2501, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.59375, "rewards/margins": 4.9375, "rewards/rejected": -9.5, "step": 7010 }, { "epoch": 0.4857795308283164, "grad_norm": 17.715711906353604, "learning_rate": 3.0560499982168455e-07, "logits/chosen": -2.828125, "logits/rejected": -3.046875, "logps/chosen": -648.0, "logps/rejected": -1176.0, "loss": 0.2553, "rewards/accuracies": 0.96875, "rewards/chosen": -4.78125, "rewards/margins": 5.3125, "rewards/rejected": -10.125, "step": 7020 }, { "epoch": 0.48647152446197495, "grad_norm": 41.9169442013972, "learning_rate": 3.0501604562576546e-07, "logits/chosen": -2.828125, "logits/rejected": -3.15625, "logps/chosen": -636.0, "logps/rejected": -1096.0, "loss": 0.2373, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.65625, "rewards/margins": 4.78125, "rewards/rejected": -9.4375, "step": 7030 }, { "epoch": 0.4871635180956335, "grad_norm": 21.71906529756231, "learning_rate": 3.044267703831594e-07, "logits/chosen": -2.84375, "logits/rejected": -3.09375, "logps/chosen": -644.0, "logps/rejected": -1072.0, "loss": 0.2339, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.6875, "rewards/margins": 4.53125, "rewards/rejected": -9.1875, "step": 7040 }, { "epoch": 0.4878555117292921, "grad_norm": 29.352093886987653, "learning_rate": 3.03837177532588e-07, "logits/chosen": -2.734375, "logits/rejected": -2.859375, "logps/chosen": -632.0, "logps/rejected": -1168.0, "loss": 0.2228, "rewards/accuracies": 0.96875, "rewards/chosen": -4.5, "rewards/margins": 5.1875, "rewards/rejected": -9.6875, "step": 7050 }, { "epoch": 0.48854750536295066, "grad_norm": 19.55485232884723, "learning_rate": 3.0324727051462605e-07, "logits/chosen": -2.75, "logits/rejected": -2.859375, "logps/chosen": -620.0, "logps/rejected": -1088.0, "loss": 0.2702, "rewards/accuracies": 0.90625, "rewards/chosen": -4.4375, "rewards/margins": 4.71875, "rewards/rejected": -9.125, "step": 7060 }, { "epoch": 0.48923949899660923, "grad_norm": 23.209421744964626, "learning_rate": 3.0265705277168215e-07, "logits/chosen": -2.828125, "logits/rejected": -2.90625, "logps/chosen": -584.0, "logps/rejected": -1088.0, "loss": 0.2495, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.1875, "rewards/margins": 5.03125, "rewards/rejected": -9.1875, "step": 7070 }, { "epoch": 0.4899314926302678, "grad_norm": 19.431070304640237, "learning_rate": 3.020665277479777e-07, "logits/chosen": -2.859375, "logits/rejected": -2.8125, "logps/chosen": -664.0, "logps/rejected": -1128.0, "loss": 0.229, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.78125, "rewards/margins": 4.75, "rewards/rejected": -9.5, "step": 7080 }, { "epoch": 0.4906234862639264, "grad_norm": 21.154101770950668, "learning_rate": 3.0147569888952746e-07, "logits/chosen": -2.84375, "logits/rejected": -2.828125, "logps/chosen": -636.0, "logps/rejected": -1096.0, "loss": 0.2755, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.53125, "rewards/margins": 4.59375, "rewards/rejected": -9.125, "step": 7090 }, { "epoch": 0.49131547989758495, "grad_norm": 22.779800343363267, "learning_rate": 3.008845696441192e-07, "logits/chosen": -2.78125, "logits/rejected": -3.09375, "logps/chosen": -664.0, "logps/rejected": -1128.0, "loss": 0.2523, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.84375, "rewards/margins": 4.8125, "rewards/rejected": -9.625, "step": 7100 }, { "epoch": 0.4920074735312435, "grad_norm": 25.63933984914035, "learning_rate": 3.002931434612935e-07, "logits/chosen": -2.703125, "logits/rejected": -3.046875, "logps/chosen": -648.0, "logps/rejected": -1040.0, "loss": 0.3447, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.6875, "rewards/margins": 4.15625, "rewards/rejected": -8.8125, "step": 7110 }, { "epoch": 0.4926994671649021, "grad_norm": 13.481600023804253, "learning_rate": 2.997014237923238e-07, "logits/chosen": -2.796875, "logits/rejected": -2.96875, "logps/chosen": -632.0, "logps/rejected": -1128.0, "loss": 0.2185, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.46875, "rewards/margins": 5.03125, "rewards/rejected": -9.5, "step": 7120 }, { "epoch": 0.49339146079856067, "grad_norm": 27.535115788297194, "learning_rate": 2.9910941409019633e-07, "logits/chosen": -2.828125, "logits/rejected": -2.921875, "logps/chosen": -608.0, "logps/rejected": -1088.0, "loss": 0.2522, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.4375, "rewards/margins": 4.8125, "rewards/rejected": -9.25, "step": 7130 }, { "epoch": 0.49408345443221924, "grad_norm": 18.673156640392282, "learning_rate": 2.9851711780958946e-07, "logits/chosen": -2.75, "logits/rejected": -2.921875, "logps/chosen": -620.0, "logps/rejected": -1072.0, "loss": 0.3013, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.4375, "rewards/margins": 4.6875, "rewards/rejected": -9.125, "step": 7140 }, { "epoch": 0.4947754480658778, "grad_norm": 16.450134909043012, "learning_rate": 2.979245384068541e-07, "logits/chosen": -2.765625, "logits/rejected": -3.0, "logps/chosen": -664.0, "logps/rejected": -1096.0, "loss": 0.2488, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.875, "rewards/margins": 4.4375, "rewards/rejected": -9.3125, "step": 7150 }, { "epoch": 0.4954674416995364, "grad_norm": 38.74342352742944, "learning_rate": 2.9733167933999345e-07, "logits/chosen": -2.703125, "logits/rejected": -2.9375, "logps/chosen": -680.0, "logps/rejected": -1128.0, "loss": 0.2464, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.75, "rewards/margins": 4.8125, "rewards/rejected": -9.5625, "step": 7160 }, { "epoch": 0.49615943533319495, "grad_norm": 24.529535689496164, "learning_rate": 2.967385440686424e-07, "logits/chosen": -2.90625, "logits/rejected": -3.015625, "logps/chosen": -668.0, "logps/rejected": -1128.0, "loss": 0.2643, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.0, "rewards/margins": 4.71875, "rewards/rejected": -9.6875, "step": 7170 }, { "epoch": 0.4968514289668535, "grad_norm": 18.701408525499303, "learning_rate": 2.9614513605404785e-07, "logits/chosen": -2.984375, "logits/rejected": -3.109375, "logps/chosen": -616.0, "logps/rejected": -1112.0, "loss": 0.2295, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.5625, "rewards/margins": 4.96875, "rewards/rejected": -9.5625, "step": 7180 }, { "epoch": 0.4975434226005121, "grad_norm": 21.78533214868208, "learning_rate": 2.9555145875904826e-07, "logits/chosen": -2.6875, "logits/rejected": -2.859375, "logps/chosen": -652.0, "logps/rejected": -1088.0, "loss": 0.2504, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.6875, "rewards/margins": 4.6875, "rewards/rejected": -9.375, "step": 7190 }, { "epoch": 0.49823541623417067, "grad_norm": 22.02979143689461, "learning_rate": 2.949575156480534e-07, "logits/chosen": -2.734375, "logits/rejected": -3.125, "logps/chosen": -628.0, "logps/rejected": -1096.0, "loss": 0.2273, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.5, "rewards/margins": 4.875, "rewards/rejected": -9.375, "step": 7200 }, { "epoch": 0.49892740986782924, "grad_norm": 32.4814623849612, "learning_rate": 2.9436331018702433e-07, "logits/chosen": -2.90625, "logits/rejected": -3.03125, "logps/chosen": -616.0, "logps/rejected": -1104.0, "loss": 0.2268, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.40625, "rewards/margins": 4.9375, "rewards/rejected": -9.3125, "step": 7210 }, { "epoch": 0.4996194035014878, "grad_norm": 25.202528123900734, "learning_rate": 2.937688458434529e-07, "logits/chosen": -2.921875, "logits/rejected": -3.0625, "logps/chosen": -652.0, "logps/rejected": -1112.0, "loss": 0.2621, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.8125, "rewards/margins": 4.6875, "rewards/rejected": -9.5, "step": 7220 }, { "epoch": 0.5003113971351464, "grad_norm": 17.354675667246912, "learning_rate": 2.9317412608634173e-07, "logits/chosen": -2.84375, "logits/rejected": -2.828125, "logps/chosen": -648.0, "logps/rejected": -1120.0, "loss": 0.2511, "rewards/accuracies": 0.96875, "rewards/chosen": -4.8125, "rewards/margins": 4.8125, "rewards/rejected": -9.625, "step": 7230 }, { "epoch": 0.501003390768805, "grad_norm": 22.132780600151584, "learning_rate": 2.925791543861841e-07, "logits/chosen": -3.0, "logits/rejected": -3.1875, "logps/chosen": -620.0, "logps/rejected": -1128.0, "loss": 0.2654, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.5, "rewards/margins": 5.0625, "rewards/rejected": -9.5625, "step": 7240 }, { "epoch": 0.5016953844024635, "grad_norm": 26.512224651010733, "learning_rate": 2.9198393421494327e-07, "logits/chosen": -2.828125, "logits/rejected": -2.953125, "logps/chosen": -664.0, "logps/rejected": -1128.0, "loss": 0.3066, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.8125, "rewards/margins": 4.5625, "rewards/rejected": -9.375, "step": 7250 }, { "epoch": 0.5023873780361221, "grad_norm": 18.8138964304021, "learning_rate": 2.913884690460325e-07, "logits/chosen": -2.859375, "logits/rejected": -2.984375, "logps/chosen": -640.0, "logps/rejected": -1136.0, "loss": 0.2173, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.6875, "rewards/margins": 5.0, "rewards/rejected": -9.6875, "step": 7260 }, { "epoch": 0.5030793716697807, "grad_norm": 15.881667928049684, "learning_rate": 2.907927623542947e-07, "logits/chosen": -2.734375, "logits/rejected": -2.984375, "logps/chosen": -632.0, "logps/rejected": -1136.0, "loss": 0.2618, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.4375, "rewards/margins": 5.125, "rewards/rejected": -9.5625, "step": 7270 }, { "epoch": 0.5037713653034392, "grad_norm": 28.078813945747658, "learning_rate": 2.9019681761598246e-07, "logits/chosen": -2.6875, "logits/rejected": -2.96875, "logps/chosen": -624.0, "logps/rejected": -1120.0, "loss": 0.2658, "rewards/accuracies": 0.96875, "rewards/chosen": -4.375, "rewards/margins": 5.09375, "rewards/rejected": -9.5, "step": 7280 }, { "epoch": 0.5044633589370978, "grad_norm": 19.591348239893396, "learning_rate": 2.8960063830873703e-07, "logits/chosen": -2.875, "logits/rejected": -3.140625, "logps/chosen": -576.0, "logps/rejected": -1040.0, "loss": 0.235, "rewards/accuracies": 0.96875, "rewards/chosen": -4.09375, "rewards/margins": 4.71875, "rewards/rejected": -8.8125, "step": 7290 }, { "epoch": 0.5051553525707564, "grad_norm": 20.484263358187416, "learning_rate": 2.8900422791156896e-07, "logits/chosen": -2.875, "logits/rejected": -3.03125, "logps/chosen": -620.0, "logps/rejected": -1072.0, "loss": 0.2431, "rewards/accuracies": 0.9375, "rewards/chosen": -4.75, "rewards/margins": 4.4375, "rewards/rejected": -9.1875, "step": 7300 }, { "epoch": 0.505847346204415, "grad_norm": 21.574305101294286, "learning_rate": 2.88407589904837e-07, "logits/chosen": -2.765625, "logits/rejected": -2.96875, "logps/chosen": -640.0, "logps/rejected": -1144.0, "loss": 0.2424, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.46875, "rewards/margins": 5.0625, "rewards/rejected": -9.5, "step": 7310 }, { "epoch": 0.5065393398380735, "grad_norm": 28.462134796168755, "learning_rate": 2.878107277702283e-07, "logits/chosen": -2.890625, "logits/rejected": -3.1875, "logps/chosen": -624.0, "logps/rejected": -1088.0, "loss": 0.2365, "rewards/accuracies": 0.96875, "rewards/chosen": -4.4375, "rewards/margins": 4.9375, "rewards/rejected": -9.375, "step": 7320 }, { "epoch": 0.5072313334717321, "grad_norm": 19.463381600564343, "learning_rate": 2.872136449907377e-07, "logits/chosen": -2.8125, "logits/rejected": -2.984375, "logps/chosen": -628.0, "logps/rejected": -1088.0, "loss": 0.2545, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.34375, "rewards/margins": 4.90625, "rewards/rejected": -9.25, "step": 7330 }, { "epoch": 0.5079233271053907, "grad_norm": 24.8988987509621, "learning_rate": 2.8661634505064795e-07, "logits/chosen": -2.953125, "logits/rejected": -2.890625, "logps/chosen": -664.0, "logps/rejected": -1176.0, "loss": 0.2686, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.71875, "rewards/margins": 5.09375, "rewards/rejected": -9.8125, "step": 7340 }, { "epoch": 0.5086153207390492, "grad_norm": 18.022915689408894, "learning_rate": 2.860188314355088e-07, "logits/chosen": -2.78125, "logits/rejected": -2.921875, "logps/chosen": -660.0, "logps/rejected": -1096.0, "loss": 0.2342, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.71875, "rewards/margins": 4.5625, "rewards/rejected": -9.25, "step": 7350 }, { "epoch": 0.5093073143727078, "grad_norm": 19.743263819629167, "learning_rate": 2.8542110763211683e-07, "logits/chosen": -2.984375, "logits/rejected": -3.140625, "logps/chosen": -632.0, "logps/rejected": -1072.0, "loss": 0.2429, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.53125, "rewards/margins": 4.65625, "rewards/rejected": -9.1875, "step": 7360 }, { "epoch": 0.5099993080063664, "grad_norm": 20.569625399180378, "learning_rate": 2.848231771284955e-07, "logits/chosen": -2.796875, "logits/rejected": -3.046875, "logps/chosen": -652.0, "logps/rejected": -1072.0, "loss": 0.2531, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.65625, "rewards/margins": 4.40625, "rewards/rejected": -9.0625, "step": 7370 }, { "epoch": 0.510691301640025, "grad_norm": 27.542547723797888, "learning_rate": 2.84225043413874e-07, "logits/chosen": -2.890625, "logits/rejected": -2.953125, "logps/chosen": -652.0, "logps/rejected": -1128.0, "loss": 0.2782, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.71875, "rewards/margins": 4.625, "rewards/rejected": -9.375, "step": 7380 }, { "epoch": 0.5113832952736835, "grad_norm": 40.87390732052686, "learning_rate": 2.83626709978668e-07, "logits/chosen": -2.90625, "logits/rejected": -3.1875, "logps/chosen": -680.0, "logps/rejected": -1192.0, "loss": 0.2702, "rewards/accuracies": 0.9375, "rewards/chosen": -4.9375, "rewards/margins": 5.15625, "rewards/rejected": -10.125, "step": 7390 }, { "epoch": 0.5120752889073421, "grad_norm": 21.37345469482736, "learning_rate": 2.8302818031445794e-07, "logits/chosen": -2.90625, "logits/rejected": -3.0625, "logps/chosen": -684.0, "logps/rejected": -1168.0, "loss": 0.2692, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.0, "rewards/margins": 4.90625, "rewards/rejected": -9.9375, "step": 7400 }, { "epoch": 0.5127672825410006, "grad_norm": 28.320638726636762, "learning_rate": 2.824294579139699e-07, "logits/chosen": -2.90625, "logits/rejected": -3.21875, "logps/chosen": -648.0, "logps/rejected": -1160.0, "loss": 0.2614, "rewards/accuracies": 0.9375, "rewards/chosen": -4.75, "rewards/margins": 5.0, "rewards/rejected": -9.75, "step": 7410 }, { "epoch": 0.5134592761746591, "grad_norm": 20.416138684330182, "learning_rate": 2.818305462710543e-07, "logits/chosen": -2.859375, "logits/rejected": -2.9375, "logps/chosen": -620.0, "logps/rejected": -1088.0, "loss": 0.2494, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.5625, "rewards/margins": 4.5625, "rewards/rejected": -9.125, "step": 7420 }, { "epoch": 0.5141512698083177, "grad_norm": 30.753747254212378, "learning_rate": 2.812314488806662e-07, "logits/chosen": -2.875, "logits/rejected": -3.109375, "logps/chosen": -652.0, "logps/rejected": -1056.0, "loss": 0.2632, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.71875, "rewards/margins": 4.25, "rewards/rejected": -9.0, "step": 7430 }, { "epoch": 0.5148432634419763, "grad_norm": 34.09173713744467, "learning_rate": 2.8063216923884426e-07, "logits/chosen": -2.8125, "logits/rejected": -3.0625, "logps/chosen": -640.0, "logps/rejected": -1112.0, "loss": 0.2586, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.65625, "rewards/margins": 4.71875, "rewards/rejected": -9.375, "step": 7440 }, { "epoch": 0.5155352570756349, "grad_norm": 29.447223075530964, "learning_rate": 2.800327108426911e-07, "logits/chosen": -2.828125, "logits/rejected": -3.078125, "logps/chosen": -628.0, "logps/rejected": -1104.0, "loss": 0.2512, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.5, "rewards/margins": 4.875, "rewards/rejected": -9.375, "step": 7450 }, { "epoch": 0.5162272507092934, "grad_norm": 21.448309408988873, "learning_rate": 2.7943307719035197e-07, "logits/chosen": -2.890625, "logits/rejected": -3.15625, "logps/chosen": -660.0, "logps/rejected": -1088.0, "loss": 0.2295, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.78125, "rewards/margins": 4.40625, "rewards/rejected": -9.1875, "step": 7460 }, { "epoch": 0.516919244342952, "grad_norm": 24.77645084612515, "learning_rate": 2.7883327178099526e-07, "logits/chosen": -2.875, "logits/rejected": -2.890625, "logps/chosen": -616.0, "logps/rejected": -1080.0, "loss": 0.2416, "rewards/accuracies": 0.96875, "rewards/chosen": -4.3125, "rewards/margins": 4.71875, "rewards/rejected": -9.0625, "step": 7470 }, { "epoch": 0.5176112379766106, "grad_norm": 21.281691571176285, "learning_rate": 2.782332981147913e-07, "logits/chosen": -2.796875, "logits/rejected": -3.109375, "logps/chosen": -608.0, "logps/rejected": -1064.0, "loss": 0.25, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.40625, "rewards/margins": 4.78125, "rewards/rejected": -9.1875, "step": 7480 }, { "epoch": 0.5183032316102691, "grad_norm": 23.77604110918201, "learning_rate": 2.776331596928926e-07, "logits/chosen": -2.984375, "logits/rejected": -3.109375, "logps/chosen": -632.0, "logps/rejected": -1080.0, "loss": 0.2287, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.6875, "rewards/margins": 4.5625, "rewards/rejected": -9.25, "step": 7490 }, { "epoch": 0.5189952252439277, "grad_norm": 25.097653474997102, "learning_rate": 2.7703286001741276e-07, "logits/chosen": -3.015625, "logits/rejected": -3.328125, "logps/chosen": -660.0, "logps/rejected": -1096.0, "loss": 0.2487, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.84375, "rewards/margins": 4.4375, "rewards/rejected": -9.3125, "step": 7500 }, { "epoch": 0.5196872188775863, "grad_norm": 17.845877946882606, "learning_rate": 2.7643240259140674e-07, "logits/chosen": -2.828125, "logits/rejected": -3.125, "logps/chosen": -632.0, "logps/rejected": -1112.0, "loss": 0.2288, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.53125, "rewards/margins": 4.90625, "rewards/rejected": -9.4375, "step": 7510 }, { "epoch": 0.5203792125112449, "grad_norm": 31.877748877173026, "learning_rate": 2.758317909188499e-07, "logits/chosen": -2.953125, "logits/rejected": -3.203125, "logps/chosen": -620.0, "logps/rejected": -1064.0, "loss": 0.2749, "rewards/accuracies": 0.9375, "rewards/chosen": -4.5625, "rewards/margins": 4.625, "rewards/rejected": -9.1875, "step": 7520 }, { "epoch": 0.5210712061449034, "grad_norm": 20.417298875222716, "learning_rate": 2.7523102850461753e-07, "logits/chosen": -2.78125, "logits/rejected": -3.03125, "logps/chosen": -648.0, "logps/rejected": -1096.0, "loss": 0.2236, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.46875, "rewards/margins": 4.84375, "rewards/rejected": -9.3125, "step": 7530 }, { "epoch": 0.521763199778562, "grad_norm": 17.055991994704087, "learning_rate": 2.7463011885446494e-07, "logits/chosen": -2.875, "logits/rejected": -3.015625, "logps/chosen": -648.0, "logps/rejected": -1120.0, "loss": 0.2533, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.6875, "rewards/margins": 4.8125, "rewards/rejected": -9.5, "step": 7540 }, { "epoch": 0.5224551934122206, "grad_norm": 32.50111976505532, "learning_rate": 2.740290654750062e-07, "logits/chosen": -2.90625, "logits/rejected": -2.9375, "logps/chosen": -636.0, "logps/rejected": -1096.0, "loss": 0.2559, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.59375, "rewards/margins": 4.84375, "rewards/rejected": -9.4375, "step": 7550 }, { "epoch": 0.5231471870458791, "grad_norm": 27.14902684357599, "learning_rate": 2.734278718736945e-07, "logits/chosen": -2.859375, "logits/rejected": -2.90625, "logps/chosen": -652.0, "logps/rejected": -1104.0, "loss": 0.2914, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.6875, "rewards/margins": 4.71875, "rewards/rejected": -9.375, "step": 7560 }, { "epoch": 0.5238391806795377, "grad_norm": 17.90118186894348, "learning_rate": 2.728265415588011e-07, "logits/chosen": -2.78125, "logits/rejected": -2.921875, "logps/chosen": -632.0, "logps/rejected": -1088.0, "loss": 0.2656, "rewards/accuracies": 0.9375, "rewards/chosen": -4.4375, "rewards/margins": 4.78125, "rewards/rejected": -9.1875, "step": 7570 }, { "epoch": 0.5245311743131963, "grad_norm": 24.075876837520358, "learning_rate": 2.722250780393951e-07, "logits/chosen": -2.71875, "logits/rejected": -3.0625, "logps/chosen": -588.0, "logps/rejected": -1064.0, "loss": 0.2287, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.125, "rewards/margins": 5.0, "rewards/rejected": -9.125, "step": 7580 }, { "epoch": 0.5252231679468549, "grad_norm": 19.537716011347598, "learning_rate": 2.716234848253229e-07, "logits/chosen": -2.734375, "logits/rejected": -2.875, "logps/chosen": -644.0, "logps/rejected": -1088.0, "loss": 0.2484, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.59375, "rewards/margins": 4.65625, "rewards/rejected": -9.25, "step": 7590 }, { "epoch": 0.5259151615805134, "grad_norm": 20.740976681534395, "learning_rate": 2.710217654271878e-07, "logits/chosen": -2.9375, "logits/rejected": -3.234375, "logps/chosen": -648.0, "logps/rejected": -1104.0, "loss": 0.2399, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.78125, "rewards/margins": 4.65625, "rewards/rejected": -9.4375, "step": 7600 }, { "epoch": 0.526607155214172, "grad_norm": 21.085222867800265, "learning_rate": 2.704199233563292e-07, "logits/chosen": -2.875, "logits/rejected": -3.21875, "logps/chosen": -684.0, "logps/rejected": -1144.0, "loss": 0.2836, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.0625, "rewards/margins": 4.84375, "rewards/rejected": -9.875, "step": 7610 }, { "epoch": 0.5272991488478306, "grad_norm": 17.006642730389096, "learning_rate": 2.6981796212480277e-07, "logits/chosen": -2.9375, "logits/rejected": -3.25, "logps/chosen": -676.0, "logps/rejected": -1080.0, "loss": 0.2578, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.03125, "rewards/margins": 4.25, "rewards/rejected": -9.25, "step": 7620 }, { "epoch": 0.5279911424814892, "grad_norm": 19.541455116962723, "learning_rate": 2.6921588524535927e-07, "logits/chosen": -2.734375, "logits/rejected": -3.09375, "logps/chosen": -672.0, "logps/rejected": -1088.0, "loss": 0.2416, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.6875, "rewards/margins": 4.65625, "rewards/rejected": -9.3125, "step": 7630 }, { "epoch": 0.5286831361151477, "grad_norm": 19.106797443740597, "learning_rate": 2.686136962314243e-07, "logits/chosen": -2.953125, "logits/rejected": -3.21875, "logps/chosen": -652.0, "logps/rejected": -1112.0, "loss": 0.2724, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.84375, "rewards/margins": 4.625, "rewards/rejected": -9.5, "step": 7640 }, { "epoch": 0.5293751297488063, "grad_norm": 22.12203066566955, "learning_rate": 2.6801139859707804e-07, "logits/chosen": -2.984375, "logits/rejected": -3.125, "logps/chosen": -664.0, "logps/rejected": -1112.0, "loss": 0.2778, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.9375, "rewards/margins": 4.5625, "rewards/rejected": -9.5, "step": 7650 }, { "epoch": 0.5300671233824649, "grad_norm": 14.382683366660181, "learning_rate": 2.6740899585703423e-07, "logits/chosen": -2.828125, "logits/rejected": -3.09375, "logps/chosen": -680.0, "logps/rejected": -1136.0, "loss": 0.228, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.03125, "rewards/margins": 4.625, "rewards/rejected": -9.625, "step": 7660 }, { "epoch": 0.5307591170161234, "grad_norm": 13.782350039320155, "learning_rate": 2.668064915266202e-07, "logits/chosen": -2.96875, "logits/rejected": -3.21875, "logps/chosen": -652.0, "logps/rejected": -1104.0, "loss": 0.2946, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.84375, "rewards/margins": 4.6875, "rewards/rejected": -9.5625, "step": 7670 }, { "epoch": 0.531451110649782, "grad_norm": 24.672358048821415, "learning_rate": 2.662038891217561e-07, "logits/chosen": -2.90625, "logits/rejected": -3.109375, "logps/chosen": -624.0, "logps/rejected": -1120.0, "loss": 0.2619, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.5625, "rewards/margins": 5.03125, "rewards/rejected": -9.5625, "step": 7680 }, { "epoch": 0.5321431042834406, "grad_norm": 23.81786911051609, "learning_rate": 2.656011921589341e-07, "logits/chosen": -2.9375, "logits/rejected": -2.921875, "logps/chosen": -652.0, "logps/rejected": -1136.0, "loss": 0.2527, "rewards/accuracies": 0.9375, "rewards/chosen": -4.875, "rewards/margins": 4.6875, "rewards/rejected": -9.5625, "step": 7690 }, { "epoch": 0.5328350979170992, "grad_norm": 20.68879054225508, "learning_rate": 2.6499840415519855e-07, "logits/chosen": -2.9375, "logits/rejected": -3.1875, "logps/chosen": -620.0, "logps/rejected": -1104.0, "loss": 0.273, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.4375, "rewards/margins": 4.84375, "rewards/rejected": -9.3125, "step": 7700 }, { "epoch": 0.5335270915507577, "grad_norm": 22.33051531237685, "learning_rate": 2.643955286281249e-07, "logits/chosen": -2.9375, "logits/rejected": -3.15625, "logps/chosen": -640.0, "logps/rejected": -1096.0, "loss": 0.2197, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.5625, "rewards/margins": 4.78125, "rewards/rejected": -9.375, "step": 7710 }, { "epoch": 0.5342190851844163, "grad_norm": 19.50404553034908, "learning_rate": 2.637925690957993e-07, "logits/chosen": -2.890625, "logits/rejected": -3.015625, "logps/chosen": -704.0, "logps/rejected": -1200.0, "loss": 0.2452, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.0625, "rewards/margins": 5.0, "rewards/rejected": -10.0625, "step": 7720 }, { "epoch": 0.5349110788180749, "grad_norm": 23.251365122813944, "learning_rate": 2.631895290767981e-07, "logits/chosen": -2.890625, "logits/rejected": -3.078125, "logps/chosen": -684.0, "logps/rejected": -1184.0, "loss": 0.2579, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.875, "rewards/margins": 5.09375, "rewards/rejected": -9.9375, "step": 7730 }, { "epoch": 0.5356030724517334, "grad_norm": 22.99107441053348, "learning_rate": 2.6258641209016754e-07, "logits/chosen": -2.90625, "logits/rejected": -3.171875, "logps/chosen": -640.0, "logps/rejected": -1144.0, "loss": 0.2722, "rewards/accuracies": 0.96875, "rewards/chosen": -4.5625, "rewards/margins": 5.125, "rewards/rejected": -9.6875, "step": 7740 }, { "epoch": 0.536295066085392, "grad_norm": 22.6533088988708, "learning_rate": 2.6198322165540276e-07, "logits/chosen": -2.859375, "logits/rejected": -3.109375, "logps/chosen": -644.0, "logps/rejected": -1136.0, "loss": 0.2681, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.59375, "rewards/margins": 5.03125, "rewards/rejected": -9.625, "step": 7750 }, { "epoch": 0.5369870597190506, "grad_norm": 23.12851602089771, "learning_rate": 2.6137996129242753e-07, "logits/chosen": -2.953125, "logits/rejected": -3.1875, "logps/chosen": -624.0, "logps/rejected": -1120.0, "loss": 0.2301, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.625, "rewards/margins": 4.9375, "rewards/rejected": -9.5625, "step": 7760 }, { "epoch": 0.5376790533527092, "grad_norm": 15.420837097858584, "learning_rate": 2.6077663452157393e-07, "logits/chosen": -2.921875, "logits/rejected": -3.09375, "logps/chosen": -604.0, "logps/rejected": -1120.0, "loss": 0.2258, "rewards/accuracies": 0.96875, "rewards/chosen": -4.46875, "rewards/margins": 5.0625, "rewards/rejected": -9.5625, "step": 7770 }, { "epoch": 0.5383710469863677, "grad_norm": 24.78478903430722, "learning_rate": 2.6017324486356125e-07, "logits/chosen": -2.84375, "logits/rejected": -3.203125, "logps/chosen": -632.0, "logps/rejected": -1152.0, "loss": 0.2265, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.625, "rewards/margins": 5.09375, "rewards/rejected": -9.75, "step": 7780 }, { "epoch": 0.5390630406200263, "grad_norm": 27.82252811467731, "learning_rate": 2.5956979583947597e-07, "logits/chosen": -2.984375, "logits/rejected": -3.296875, "logps/chosen": -656.0, "logps/rejected": -1144.0, "loss": 0.2589, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.8125, "rewards/margins": 5.09375, "rewards/rejected": -9.9375, "step": 7790 }, { "epoch": 0.5397550342536849, "grad_norm": 28.637184693969395, "learning_rate": 2.5896629097075093e-07, "logits/chosen": -2.859375, "logits/rejected": -3.03125, "logps/chosen": -616.0, "logps/rejected": -1096.0, "loss": 0.2638, "rewards/accuracies": 0.90625, "rewards/chosen": -4.46875, "rewards/margins": 4.75, "rewards/rejected": -9.1875, "step": 7800 }, { "epoch": 0.5404470278873434, "grad_norm": 19.980697482344127, "learning_rate": 2.5836273377914485e-07, "logits/chosen": -2.9375, "logits/rejected": -3.265625, "logps/chosen": -616.0, "logps/rejected": -1064.0, "loss": 0.2563, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.375, "rewards/margins": 4.75, "rewards/rejected": -9.125, "step": 7810 }, { "epoch": 0.541139021521002, "grad_norm": 16.35933625330905, "learning_rate": 2.577591277867218e-07, "logits/chosen": -2.859375, "logits/rejected": -3.0625, "logps/chosen": -632.0, "logps/rejected": -1064.0, "loss": 0.2208, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.5625, "rewards/margins": 4.40625, "rewards/rejected": -9.0, "step": 7820 }, { "epoch": 0.5418310151546606, "grad_norm": 14.399081517586481, "learning_rate": 2.5715547651583053e-07, "logits/chosen": -2.796875, "logits/rejected": -3.015625, "logps/chosen": -652.0, "logps/rejected": -1112.0, "loss": 0.2543, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.65625, "rewards/margins": 4.78125, "rewards/rejected": -9.4375, "step": 7830 }, { "epoch": 0.5425230087883192, "grad_norm": 28.929828999885697, "learning_rate": 2.565517834890842e-07, "logits/chosen": -2.96875, "logits/rejected": -3.171875, "logps/chosen": -612.0, "logps/rejected": -1096.0, "loss": 0.2904, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.40625, "rewards/margins": 5.0, "rewards/rejected": -9.4375, "step": 7840 }, { "epoch": 0.5432150024219777, "grad_norm": 18.99487146769679, "learning_rate": 2.5594805222933945e-07, "logits/chosen": -2.96875, "logits/rejected": -3.171875, "logps/chosen": -656.0, "logps/rejected": -1096.0, "loss": 0.2879, "rewards/accuracies": 0.9375, "rewards/chosen": -4.78125, "rewards/margins": 4.65625, "rewards/rejected": -9.4375, "step": 7850 }, { "epoch": 0.5439069960556363, "grad_norm": 23.482729010536524, "learning_rate": 2.5534428625967626e-07, "logits/chosen": -3.0, "logits/rejected": -3.265625, "logps/chosen": -628.0, "logps/rejected": -1088.0, "loss": 0.2517, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.625, "rewards/margins": 4.5625, "rewards/rejected": -9.1875, "step": 7860 }, { "epoch": 0.5445989896892949, "grad_norm": 21.640856571770748, "learning_rate": 2.5474048910337676e-07, "logits/chosen": -2.765625, "logits/rejected": -2.984375, "logps/chosen": -640.0, "logps/rejected": -1104.0, "loss": 0.2494, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.53125, "rewards/margins": 4.84375, "rewards/rejected": -9.375, "step": 7870 }, { "epoch": 0.5452909833229534, "grad_norm": 24.033014712831385, "learning_rate": 2.5413666428390564e-07, "logits/chosen": -2.6875, "logits/rejected": -2.859375, "logps/chosen": -600.0, "logps/rejected": -1064.0, "loss": 0.2916, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.09375, "rewards/margins": 4.96875, "rewards/rejected": -9.0625, "step": 7880 }, { "epoch": 0.545982976956612, "grad_norm": 24.671470320409533, "learning_rate": 2.535328153248884e-07, "logits/chosen": -2.75, "logits/rejected": -2.78125, "logps/chosen": -596.0, "logps/rejected": -1016.0, "loss": 0.2695, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.15625, "rewards/margins": 4.28125, "rewards/rejected": -8.4375, "step": 7890 }, { "epoch": 0.5466749705902706, "grad_norm": 24.36354674686656, "learning_rate": 2.529289457500919e-07, "logits/chosen": -3.015625, "logits/rejected": -3.28125, "logps/chosen": -640.0, "logps/rejected": -1056.0, "loss": 0.2734, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.71875, "rewards/margins": 4.25, "rewards/rejected": -8.9375, "step": 7900 }, { "epoch": 0.5473669642239292, "grad_norm": 17.160357491614455, "learning_rate": 2.5232505908340295e-07, "logits/chosen": -2.96875, "logits/rejected": -3.09375, "logps/chosen": -640.0, "logps/rejected": -1072.0, "loss": 0.238, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.5625, "rewards/margins": 4.4375, "rewards/rejected": -9.0, "step": 7910 }, { "epoch": 0.5480589578575877, "grad_norm": 19.886512028684713, "learning_rate": 2.5172115884880854e-07, "logits/chosen": -2.875, "logits/rejected": -3.25, "logps/chosen": -636.0, "logps/rejected": -1096.0, "loss": 0.2429, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.53125, "rewards/margins": 4.84375, "rewards/rejected": -9.375, "step": 7920 }, { "epoch": 0.5487509514912463, "grad_norm": 41.95438554747116, "learning_rate": 2.511172485703742e-07, "logits/chosen": -2.875, "logits/rejected": -3.0625, "logps/chosen": -640.0, "logps/rejected": -1088.0, "loss": 0.2774, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.71875, "rewards/margins": 4.625, "rewards/rejected": -9.375, "step": 7930 }, { "epoch": 0.5494429451249049, "grad_norm": 23.58195224488734, "learning_rate": 2.5051333177222474e-07, "logits/chosen": -3.0, "logits/rejected": -3.359375, "logps/chosen": -616.0, "logps/rejected": -1112.0, "loss": 0.2784, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.5625, "rewards/margins": 5.03125, "rewards/rejected": -9.625, "step": 7940 }, { "epoch": 0.5501349387585635, "grad_norm": 22.25540960214912, "learning_rate": 2.499094119785224e-07, "logits/chosen": -3.0, "logits/rejected": -3.25, "logps/chosen": -632.0, "logps/rejected": -1072.0, "loss": 0.2762, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.53125, "rewards/margins": 4.5625, "rewards/rejected": -9.0625, "step": 7950 }, { "epoch": 0.550826932392222, "grad_norm": 25.26913143070793, "learning_rate": 2.493054927134475e-07, "logits/chosen": -2.9375, "logits/rejected": -3.125, "logps/chosen": -648.0, "logps/rejected": -1128.0, "loss": 0.2486, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.75, "rewards/margins": 4.84375, "rewards/rejected": -9.625, "step": 7960 }, { "epoch": 0.5515189260258806, "grad_norm": 28.196910321016826, "learning_rate": 2.487015775011768e-07, "logits/chosen": -2.765625, "logits/rejected": -3.1875, "logps/chosen": -632.0, "logps/rejected": -1096.0, "loss": 0.1975, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.5625, "rewards/margins": 5.03125, "rewards/rejected": -9.5625, "step": 7970 }, { "epoch": 0.5522109196595392, "grad_norm": 33.45717389591765, "learning_rate": 2.480976698658637e-07, "logits/chosen": -2.890625, "logits/rejected": -2.953125, "logps/chosen": -624.0, "logps/rejected": -1136.0, "loss": 0.2542, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.40625, "rewards/margins": 5.15625, "rewards/rejected": -9.5625, "step": 7980 }, { "epoch": 0.5529029132931977, "grad_norm": 17.69968542618763, "learning_rate": 2.474937733316172e-07, "logits/chosen": -2.890625, "logits/rejected": -3.125, "logps/chosen": -640.0, "logps/rejected": -1128.0, "loss": 0.2319, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.71875, "rewards/margins": 4.875, "rewards/rejected": -9.5625, "step": 7990 }, { "epoch": 0.5535949069268563, "grad_norm": 19.763826496203613, "learning_rate": 2.468898914224815e-07, "logits/chosen": -2.953125, "logits/rejected": -3.203125, "logps/chosen": -668.0, "logps/rejected": -1168.0, "loss": 0.2355, "rewards/accuracies": 0.96875, "rewards/chosen": -4.875, "rewards/margins": 5.15625, "rewards/rejected": -10.0, "step": 8000 }, { "epoch": 0.5535949069268563, "eval_logits/chosen": -2.921875, "eval_logits/rejected": -3.140625, "eval_logps/chosen": -708.0, "eval_logps/rejected": -1096.0, "eval_loss": 0.22711098194122314, "eval_rewards/accuracies": 0.898299515247345, "eval_rewards/chosen": -5.1875, "eval_rewards/margins": 4.15625, "eval_rewards/rejected": -9.375, "eval_runtime": 2936.17, "eval_samples_per_second": 33.32, "eval_steps_per_second": 0.521, "step": 8000 }, { "epoch": 0.5542869005605149, "grad_norm": 27.71789675472398, "learning_rate": 2.4628602766241566e-07, "logits/chosen": -2.875, "logits/rejected": -3.234375, "logps/chosen": -700.0, "logps/rejected": -1120.0, "loss": 0.2761, "rewards/accuracies": 0.9375, "rewards/chosen": -5.03125, "rewards/margins": 4.5625, "rewards/rejected": -9.625, "step": 8010 }, { "epoch": 0.5549788941941735, "grad_norm": 24.585534862730682, "learning_rate": 2.4568218557527285e-07, "logits/chosen": -2.78125, "logits/rejected": -3.078125, "logps/chosen": -664.0, "logps/rejected": -1104.0, "loss": 0.2536, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.84375, "rewards/margins": 4.40625, "rewards/rejected": -9.25, "step": 8020 }, { "epoch": 0.555670887827832, "grad_norm": 18.09866445359616, "learning_rate": 2.450783686847794e-07, "logits/chosen": -2.84375, "logits/rejected": -3.0, "logps/chosen": -620.0, "logps/rejected": -1040.0, "loss": 0.2034, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.46875, "rewards/margins": 4.34375, "rewards/rejected": -8.8125, "step": 8030 }, { "epoch": 0.5563628814614906, "grad_norm": 19.91612900783099, "learning_rate": 2.444745805145149e-07, "logits/chosen": -2.890625, "logits/rejected": -3.234375, "logps/chosen": -612.0, "logps/rejected": -1064.0, "loss": 0.2256, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.4375, "rewards/margins": 4.65625, "rewards/rejected": -9.125, "step": 8040 }, { "epoch": 0.5570548750951492, "grad_norm": 23.743665604488587, "learning_rate": 2.4387082458789147e-07, "logits/chosen": -2.75, "logits/rejected": -3.015625, "logps/chosen": -664.0, "logps/rejected": -1120.0, "loss": 0.2281, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.75, "rewards/margins": 4.78125, "rewards/rejected": -9.5, "step": 8050 }, { "epoch": 0.5577468687288077, "grad_norm": 31.00627122129957, "learning_rate": 2.432671044281326e-07, "logits/chosen": -2.9375, "logits/rejected": -3.109375, "logps/chosen": -648.0, "logps/rejected": -1160.0, "loss": 0.2941, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.71875, "rewards/margins": 5.25, "rewards/rejected": -10.0, "step": 8060 }, { "epoch": 0.5584388623624663, "grad_norm": 18.709081972373298, "learning_rate": 2.426634235582535e-07, "logits/chosen": -2.984375, "logits/rejected": -3.109375, "logps/chosen": -648.0, "logps/rejected": -1136.0, "loss": 0.2184, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.71875, "rewards/margins": 5.03125, "rewards/rejected": -9.75, "step": 8070 }, { "epoch": 0.5591308559961249, "grad_norm": 19.070496688939585, "learning_rate": 2.4205978550104005e-07, "logits/chosen": -2.921875, "logits/rejected": -3.453125, "logps/chosen": -644.0, "logps/rejected": -1120.0, "loss": 0.2442, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.78125, "rewards/margins": 4.875, "rewards/rejected": -9.6875, "step": 8080 }, { "epoch": 0.5598228496297835, "grad_norm": 20.52281655160402, "learning_rate": 2.41456193779028e-07, "logits/chosen": -2.859375, "logits/rejected": -3.25, "logps/chosen": -616.0, "logps/rejected": -1112.0, "loss": 0.2304, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.375, "rewards/margins": 5.09375, "rewards/rejected": -9.4375, "step": 8090 }, { "epoch": 0.560514843263442, "grad_norm": 23.974313089043854, "learning_rate": 2.4085265191448295e-07, "logits/chosen": -2.890625, "logits/rejected": -3.140625, "logps/chosen": -632.0, "logps/rejected": -1112.0, "loss": 0.2698, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.5625, "rewards/margins": 4.875, "rewards/rejected": -9.4375, "step": 8100 }, { "epoch": 0.5612068368971005, "grad_norm": 22.00071357806052, "learning_rate": 2.402491634293796e-07, "logits/chosen": -2.9375, "logits/rejected": -3.078125, "logps/chosen": -652.0, "logps/rejected": -1152.0, "loss": 0.2526, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.75, "rewards/margins": 5.15625, "rewards/rejected": -9.875, "step": 8110 }, { "epoch": 0.5618988305307591, "grad_norm": 26.032334952396834, "learning_rate": 2.3964573184538105e-07, "logits/chosen": -2.859375, "logits/rejected": -3.28125, "logps/chosen": -620.0, "logps/rejected": -1072.0, "loss": 0.2458, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.6875, "rewards/margins": 4.53125, "rewards/rejected": -9.1875, "step": 8120 }, { "epoch": 0.5625908241644176, "grad_norm": 27.935455621780502, "learning_rate": 2.390423606838183e-07, "logits/chosen": -3.03125, "logits/rejected": -3.296875, "logps/chosen": -604.0, "logps/rejected": -1080.0, "loss": 0.2138, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.5, "rewards/margins": 4.71875, "rewards/rejected": -9.1875, "step": 8130 }, { "epoch": 0.5632828177980762, "grad_norm": 16.381026689697638, "learning_rate": 2.3843905346566993e-07, "logits/chosen": -2.953125, "logits/rejected": -3.15625, "logps/chosen": -628.0, "logps/rejected": -1096.0, "loss": 0.2372, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.5625, "rewards/margins": 4.71875, "rewards/rejected": -9.25, "step": 8140 }, { "epoch": 0.5639748114317348, "grad_norm": 15.147534272042071, "learning_rate": 2.3783581371154116e-07, "logits/chosen": -2.96875, "logits/rejected": -3.09375, "logps/chosen": -664.0, "logps/rejected": -1144.0, "loss": 0.2599, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.8125, "rewards/margins": 4.90625, "rewards/rejected": -9.6875, "step": 8150 }, { "epoch": 0.5646668050653934, "grad_norm": 25.27739936212574, "learning_rate": 2.372326449416437e-07, "logits/chosen": -2.9375, "logits/rejected": -3.15625, "logps/chosen": -676.0, "logps/rejected": -1096.0, "loss": 0.2251, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.9375, "rewards/margins": 4.46875, "rewards/rejected": -9.4375, "step": 8160 }, { "epoch": 0.5653587986990519, "grad_norm": 23.119762804902788, "learning_rate": 2.36629550675775e-07, "logits/chosen": -3.015625, "logits/rejected": -2.953125, "logps/chosen": -676.0, "logps/rejected": -1136.0, "loss": 0.2551, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.875, "rewards/margins": 4.6875, "rewards/rejected": -9.5625, "step": 8170 }, { "epoch": 0.5660507923327105, "grad_norm": 23.92882029232927, "learning_rate": 2.3602653443329764e-07, "logits/chosen": -2.75, "logits/rejected": -3.0625, "logps/chosen": -656.0, "logps/rejected": -1128.0, "loss": 0.2568, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -4.8125, "rewards/margins": 4.8125, "rewards/rejected": -9.625, "step": 8180 }, { "epoch": 0.5667427859663691, "grad_norm": 25.678836157076812, "learning_rate": 2.3542359973311893e-07, "logits/chosen": -2.859375, "logits/rejected": -3.0625, "logps/chosen": -652.0, "logps/rejected": -1152.0, "loss": 0.2161, "rewards/accuracies": 0.96875, "rewards/chosen": -4.625, "rewards/margins": 5.15625, "rewards/rejected": -9.8125, "step": 8190 }, { "epoch": 0.5674347796000276, "grad_norm": 24.44905665102302, "learning_rate": 2.348207500936706e-07, "logits/chosen": -3.03125, "logits/rejected": -3.46875, "logps/chosen": -632.0, "logps/rejected": -1120.0, "loss": 0.272, "rewards/accuracies": 0.96875, "rewards/chosen": -4.53125, "rewards/margins": 5.25, "rewards/rejected": -9.75, "step": 8200 }, { "epoch": 0.5681267732336862, "grad_norm": 24.40327361878364, "learning_rate": 2.3421798903288743e-07, "logits/chosen": -2.921875, "logits/rejected": -3.375, "logps/chosen": -636.0, "logps/rejected": -1088.0, "loss": 0.2407, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.5, "rewards/margins": 4.875, "rewards/rejected": -9.375, "step": 8210 }, { "epoch": 0.5688187668673448, "grad_norm": 28.26505680856842, "learning_rate": 2.3361532006818786e-07, "logits/chosen": -2.96875, "logits/rejected": -3.265625, "logps/chosen": -668.0, "logps/rejected": -1128.0, "loss": 0.2403, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.875, "rewards/margins": 4.75, "rewards/rejected": -9.625, "step": 8220 }, { "epoch": 0.5695107605010034, "grad_norm": 15.347558810518105, "learning_rate": 2.330127467164527e-07, "logits/chosen": -3.109375, "logits/rejected": -3.375, "logps/chosen": -680.0, "logps/rejected": -1152.0, "loss": 0.2777, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.0, "rewards/margins": 4.8125, "rewards/rejected": -9.8125, "step": 8230 }, { "epoch": 0.5702027541346619, "grad_norm": 30.88423592114608, "learning_rate": 2.3241027249400474e-07, "logits/chosen": -3.03125, "logits/rejected": -3.265625, "logps/chosen": -636.0, "logps/rejected": -1120.0, "loss": 0.2481, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.59375, "rewards/margins": 5.0, "rewards/rejected": -9.5625, "step": 8240 }, { "epoch": 0.5708947477683205, "grad_norm": 23.685136711437274, "learning_rate": 2.318079009165883e-07, "logits/chosen": -3.078125, "logits/rejected": -3.53125, "logps/chosen": -632.0, "logps/rejected": -1112.0, "loss": 0.2134, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.625, "rewards/margins": 5.0625, "rewards/rejected": -9.6875, "step": 8250 }, { "epoch": 0.5715867414019791, "grad_norm": 27.275193025061736, "learning_rate": 2.3120563549934894e-07, "logits/chosen": -2.859375, "logits/rejected": -3.265625, "logps/chosen": -664.0, "logps/rejected": -1104.0, "loss": 0.2696, "rewards/accuracies": 0.90625, "rewards/chosen": -4.71875, "rewards/margins": 4.75, "rewards/rejected": -9.5, "step": 8260 }, { "epoch": 0.5722787350356376, "grad_norm": 30.25243615445592, "learning_rate": 2.3060347975681238e-07, "logits/chosen": -2.96875, "logits/rejected": -3.03125, "logps/chosen": -632.0, "logps/rejected": -1128.0, "loss": 0.2588, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.625, "rewards/margins": 5.0, "rewards/rejected": -9.625, "step": 8270 }, { "epoch": 0.5729707286692962, "grad_norm": 12.907439896396449, "learning_rate": 2.300014372028646e-07, "logits/chosen": -2.859375, "logits/rejected": -3.28125, "logps/chosen": -664.0, "logps/rejected": -1088.0, "loss": 0.2056, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.71875, "rewards/margins": 4.65625, "rewards/rejected": -9.375, "step": 8280 }, { "epoch": 0.5736627223029548, "grad_norm": 23.18719791603458, "learning_rate": 2.29399511350731e-07, "logits/chosen": -2.828125, "logits/rejected": -3.046875, "logps/chosen": -656.0, "logps/rejected": -1144.0, "loss": 0.2645, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.65625, "rewards/margins": 4.90625, "rewards/rejected": -9.5625, "step": 8290 }, { "epoch": 0.5743547159366134, "grad_norm": 22.295516107805685, "learning_rate": 2.2879770571295578e-07, "logits/chosen": -3.078125, "logits/rejected": -3.296875, "logps/chosen": -632.0, "logps/rejected": -1128.0, "loss": 0.2422, "rewards/accuracies": 0.9375, "rewards/chosen": -4.59375, "rewards/margins": 5.03125, "rewards/rejected": -9.625, "step": 8300 }, { "epoch": 0.5750467095702719, "grad_norm": 15.785745378223927, "learning_rate": 2.2819602380138194e-07, "logits/chosen": -3.03125, "logits/rejected": -3.34375, "logps/chosen": -684.0, "logps/rejected": -1176.0, "loss": 0.2484, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.96875, "rewards/margins": 5.09375, "rewards/rejected": -10.0625, "step": 8310 }, { "epoch": 0.5757387032039305, "grad_norm": 19.793982019551603, "learning_rate": 2.275944691271303e-07, "logits/chosen": -3.0, "logits/rejected": -3.09375, "logps/chosen": -664.0, "logps/rejected": -1128.0, "loss": 0.2268, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.8125, "rewards/margins": 4.59375, "rewards/rejected": -9.4375, "step": 8320 }, { "epoch": 0.5764306968375891, "grad_norm": 16.75394879835051, "learning_rate": 2.269930452005792e-07, "logits/chosen": -2.890625, "logits/rejected": -3.109375, "logps/chosen": -656.0, "logps/rejected": -1104.0, "loss": 0.2325, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.6875, "rewards/margins": 4.71875, "rewards/rejected": -9.4375, "step": 8330 }, { "epoch": 0.5771226904712476, "grad_norm": 13.781571086929633, "learning_rate": 2.263917555313439e-07, "logits/chosen": -2.90625, "logits/rejected": -3.265625, "logps/chosen": -656.0, "logps/rejected": -1136.0, "loss": 0.215, "rewards/accuracies": 0.9375, "rewards/chosen": -4.8125, "rewards/margins": 4.9375, "rewards/rejected": -9.75, "step": 8340 }, { "epoch": 0.5778146841049062, "grad_norm": 27.36736539342407, "learning_rate": 2.2579060362825644e-07, "logits/chosen": -2.84375, "logits/rejected": -3.078125, "logps/chosen": -680.0, "logps/rejected": -1192.0, "loss": 0.2075, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.96875, "rewards/margins": 5.21875, "rewards/rejected": -10.1875, "step": 8350 }, { "epoch": 0.5785066777385648, "grad_norm": 27.290253743265364, "learning_rate": 2.251895929993448e-07, "logits/chosen": -3.015625, "logits/rejected": -3.015625, "logps/chosen": -660.0, "logps/rejected": -1152.0, "loss": 0.2318, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.90625, "rewards/margins": 5.0, "rewards/rejected": -9.875, "step": 8360 }, { "epoch": 0.5791986713722234, "grad_norm": 20.08690074137453, "learning_rate": 2.2458872715181243e-07, "logits/chosen": -2.984375, "logits/rejected": -3.015625, "logps/chosen": -632.0, "logps/rejected": -1184.0, "loss": 0.2238, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.5, "rewards/margins": 5.625, "rewards/rejected": -10.125, "step": 8370 }, { "epoch": 0.5798906650058819, "grad_norm": 25.822793435340643, "learning_rate": 2.2398800959201805e-07, "logits/chosen": -2.859375, "logits/rejected": -3.0, "logps/chosen": -672.0, "logps/rejected": -1136.0, "loss": 0.2653, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.65625, "rewards/margins": 4.90625, "rewards/rejected": -9.5, "step": 8380 }, { "epoch": 0.5805826586395405, "grad_norm": 17.781493200469438, "learning_rate": 2.2338744382545513e-07, "logits/chosen": -3.015625, "logits/rejected": -3.203125, "logps/chosen": -664.0, "logps/rejected": -1144.0, "loss": 0.249, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.9375, "rewards/margins": 5.0, "rewards/rejected": -9.9375, "step": 8390 }, { "epoch": 0.5812746522731991, "grad_norm": 16.182359493162735, "learning_rate": 2.2278703335673103e-07, "logits/chosen": -2.953125, "logits/rejected": -3.046875, "logps/chosen": -692.0, "logps/rejected": -1176.0, "loss": 0.2432, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.15625, "rewards/margins": 4.875, "rewards/rejected": -10.0625, "step": 8400 }, { "epoch": 0.5819666459068576, "grad_norm": 25.833840975944483, "learning_rate": 2.2218678168954724e-07, "logits/chosen": -2.921875, "logits/rejected": -3.28125, "logps/chosen": -672.0, "logps/rejected": -1120.0, "loss": 0.2492, "rewards/accuracies": 0.96875, "rewards/chosen": -4.96875, "rewards/margins": 4.71875, "rewards/rejected": -9.6875, "step": 8410 }, { "epoch": 0.5826586395405162, "grad_norm": 21.51358204311537, "learning_rate": 2.215866923266784e-07, "logits/chosen": -2.84375, "logits/rejected": -2.953125, "logps/chosen": -700.0, "logps/rejected": -1192.0, "loss": 0.2202, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.125, "rewards/margins": 5.125, "rewards/rejected": -10.25, "step": 8420 }, { "epoch": 0.5833506331741748, "grad_norm": 25.305418942265618, "learning_rate": 2.209867687699519e-07, "logits/chosen": -2.890625, "logits/rejected": -3.0, "logps/chosen": -668.0, "logps/rejected": -1176.0, "loss": 0.2479, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.84375, "rewards/margins": 5.15625, "rewards/rejected": -10.0, "step": 8430 }, { "epoch": 0.5840426268078334, "grad_norm": 23.552960108483614, "learning_rate": 2.2038701452022768e-07, "logits/chosen": -2.796875, "logits/rejected": -2.96875, "logps/chosen": -652.0, "logps/rejected": -1120.0, "loss": 0.24, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.625, "rewards/margins": 4.875, "rewards/rejected": -9.5, "step": 8440 }, { "epoch": 0.5847346204414919, "grad_norm": 20.591895974159037, "learning_rate": 2.1978743307737788e-07, "logits/chosen": -2.984375, "logits/rejected": -3.328125, "logps/chosen": -656.0, "logps/rejected": -1128.0, "loss": 0.227, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.875, "rewards/margins": 4.9375, "rewards/rejected": -9.8125, "step": 8450 }, { "epoch": 0.5854266140751505, "grad_norm": 21.231333702804324, "learning_rate": 2.1918802794026585e-07, "logits/chosen": -2.796875, "logits/rejected": -3.078125, "logps/chosen": -640.0, "logps/rejected": -1096.0, "loss": 0.2501, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.65625, "rewards/margins": 4.5625, "rewards/rejected": -9.1875, "step": 8460 }, { "epoch": 0.5861186077088091, "grad_norm": 26.615385557126487, "learning_rate": 2.1858880260672633e-07, "logits/chosen": -2.875, "logits/rejected": -3.125, "logps/chosen": -648.0, "logps/rejected": -1064.0, "loss": 0.2745, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.625, "rewards/margins": 4.4375, "rewards/rejected": -9.0625, "step": 8470 }, { "epoch": 0.5868106013424677, "grad_norm": 29.122396263571154, "learning_rate": 2.1798976057354484e-07, "logits/chosen": -2.859375, "logits/rejected": -3.125, "logps/chosen": -676.0, "logps/rejected": -1080.0, "loss": 0.2389, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.90625, "rewards/margins": 4.40625, "rewards/rejected": -9.3125, "step": 8480 }, { "epoch": 0.5875025949761262, "grad_norm": 33.91511023684988, "learning_rate": 2.173909053364371e-07, "logits/chosen": -2.90625, "logits/rejected": -2.984375, "logps/chosen": -640.0, "logps/rejected": -1088.0, "loss": 0.2338, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.5625, "rewards/margins": 4.5, "rewards/rejected": -9.0625, "step": 8490 }, { "epoch": 0.5881945886097848, "grad_norm": 24.592327127506675, "learning_rate": 2.1679224039002884e-07, "logits/chosen": -2.875, "logits/rejected": -3.34375, "logps/chosen": -624.0, "logps/rejected": -1088.0, "loss": 0.231, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.59375, "rewards/margins": 4.71875, "rewards/rejected": -9.3125, "step": 8500 }, { "epoch": 0.5888865822434434, "grad_norm": 17.95266416861229, "learning_rate": 2.1619376922783552e-07, "logits/chosen": -2.8125, "logits/rejected": -2.953125, "logps/chosen": -704.0, "logps/rejected": -1160.0, "loss": 0.2439, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.0, "rewards/margins": 4.8125, "rewards/rejected": -9.8125, "step": 8510 }, { "epoch": 0.5895785758771019, "grad_norm": 22.952392023033404, "learning_rate": 2.1559549534224148e-07, "logits/chosen": -2.96875, "logits/rejected": -3.15625, "logps/chosen": -628.0, "logps/rejected": -1176.0, "loss": 0.2543, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.40625, "rewards/margins": 5.46875, "rewards/rejected": -9.875, "step": 8520 }, { "epoch": 0.5902705695107605, "grad_norm": 19.62327184506899, "learning_rate": 2.1499742222448003e-07, "logits/chosen": -2.921875, "logits/rejected": -3.234375, "logps/chosen": -660.0, "logps/rejected": -1144.0, "loss": 0.2309, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.84375, "rewards/margins": 4.96875, "rewards/rejected": -9.8125, "step": 8530 }, { "epoch": 0.5909625631444191, "grad_norm": 16.109658050114074, "learning_rate": 2.1439955336461294e-07, "logits/chosen": -2.84375, "logits/rejected": -3.140625, "logps/chosen": -636.0, "logps/rejected": -1128.0, "loss": 0.2262, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.5625, "rewards/margins": 5.1875, "rewards/rejected": -9.75, "step": 8540 }, { "epoch": 0.5916545567780777, "grad_norm": 23.085282153650667, "learning_rate": 2.1380189225150973e-07, "logits/chosen": -2.96875, "logits/rejected": -3.296875, "logps/chosen": -616.0, "logps/rejected": -1112.0, "loss": 0.2333, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.53125, "rewards/margins": 4.9375, "rewards/rejected": -9.4375, "step": 8550 }, { "epoch": 0.5923465504117362, "grad_norm": 20.67891931427504, "learning_rate": 2.1320444237282809e-07, "logits/chosen": -3.046875, "logits/rejected": -3.265625, "logps/chosen": -608.0, "logps/rejected": -1112.0, "loss": 0.2191, "rewards/accuracies": 0.96875, "rewards/chosen": -4.4375, "rewards/margins": 5.09375, "rewards/rejected": -9.5, "step": 8560 }, { "epoch": 0.5930385440453948, "grad_norm": 18.618201797875642, "learning_rate": 2.1260720721499266e-07, "logits/chosen": -3.015625, "logits/rejected": -3.125, "logps/chosen": -648.0, "logps/rejected": -1168.0, "loss": 0.2684, "rewards/accuracies": 0.96875, "rewards/chosen": -4.8125, "rewards/margins": 4.96875, "rewards/rejected": -9.8125, "step": 8570 }, { "epoch": 0.5937305376790534, "grad_norm": 26.397048960423852, "learning_rate": 2.1201019026317518e-07, "logits/chosen": -2.890625, "logits/rejected": -3.09375, "logps/chosen": -692.0, "logps/rejected": -1176.0, "loss": 0.2484, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.125, "rewards/margins": 4.9375, "rewards/rejected": -10.0625, "step": 8580 }, { "epoch": 0.5944225313127119, "grad_norm": 16.769633648416338, "learning_rate": 2.114133950012741e-07, "logits/chosen": -3.03125, "logits/rejected": -3.21875, "logps/chosen": -668.0, "logps/rejected": -1144.0, "loss": 0.2161, "rewards/accuracies": 0.96875, "rewards/chosen": -4.90625, "rewards/margins": 4.8125, "rewards/rejected": -9.75, "step": 8590 }, { "epoch": 0.5951145249463705, "grad_norm": 16.838898684400174, "learning_rate": 2.1081682491189428e-07, "logits/chosen": -3.015625, "logits/rejected": -3.296875, "logps/chosen": -644.0, "logps/rejected": -1144.0, "loss": 0.2338, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.875, "rewards/margins": 4.96875, "rewards/rejected": -9.8125, "step": 8600 }, { "epoch": 0.5958065185800291, "grad_norm": 27.622138875187993, "learning_rate": 2.1022048347632622e-07, "logits/chosen": -2.984375, "logits/rejected": -3.09375, "logps/chosen": -620.0, "logps/rejected": -1152.0, "loss": 0.2371, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.5, "rewards/margins": 5.1875, "rewards/rejected": -9.6875, "step": 8610 }, { "epoch": 0.5964985122136877, "grad_norm": 23.728205676545375, "learning_rate": 2.0962437417452656e-07, "logits/chosen": -2.890625, "logits/rejected": -3.125, "logps/chosen": -668.0, "logps/rejected": -1160.0, "loss": 0.2363, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.78125, "rewards/margins": 5.09375, "rewards/rejected": -9.875, "step": 8620 }, { "epoch": 0.5971905058473462, "grad_norm": 28.835914101603766, "learning_rate": 2.0902850048509712e-07, "logits/chosen": -2.96875, "logits/rejected": -3.203125, "logps/chosen": -652.0, "logps/rejected": -1120.0, "loss": 0.2489, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.75, "rewards/margins": 4.78125, "rewards/rejected": -9.5, "step": 8630 }, { "epoch": 0.5978824994810048, "grad_norm": 33.28684863494531, "learning_rate": 2.084328658852647e-07, "logits/chosen": -3.0, "logits/rejected": -3.34375, "logps/chosen": -648.0, "logps/rejected": -1088.0, "loss": 0.2583, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.90625, "rewards/margins": 4.46875, "rewards/rejected": -9.375, "step": 8640 }, { "epoch": 0.5985744931146634, "grad_norm": 15.669656138256439, "learning_rate": 2.0783747385086096e-07, "logits/chosen": -2.9375, "logits/rejected": -3.078125, "logps/chosen": -688.0, "logps/rejected": -1184.0, "loss": 0.2641, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.0625, "rewards/margins": 4.875, "rewards/rejected": -9.9375, "step": 8650 }, { "epoch": 0.599266486748322, "grad_norm": 27.803246662513676, "learning_rate": 2.072423278563023e-07, "logits/chosen": -3.015625, "logits/rejected": -3.328125, "logps/chosen": -632.0, "logps/rejected": -1072.0, "loss": 0.2684, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.5625, "rewards/margins": 4.59375, "rewards/rejected": -9.125, "step": 8660 }, { "epoch": 0.5999584803819805, "grad_norm": 25.472378467610802, "learning_rate": 2.0664743137456894e-07, "logits/chosen": -2.90625, "logits/rejected": -3.125, "logps/chosen": -652.0, "logps/rejected": -1136.0, "loss": 0.2341, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.75, "rewards/margins": 4.71875, "rewards/rejected": -9.5, "step": 8670 }, { "epoch": 0.6006504740156391, "grad_norm": 27.018793926901864, "learning_rate": 2.0605278787718538e-07, "logits/chosen": -2.921875, "logits/rejected": -3.28125, "logps/chosen": -624.0, "logps/rejected": -1096.0, "loss": 0.2538, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.5625, "rewards/margins": 4.78125, "rewards/rejected": -9.375, "step": 8680 }, { "epoch": 0.6013424676492977, "grad_norm": 38.44393552738461, "learning_rate": 2.054584008341998e-07, "logits/chosen": -2.796875, "logits/rejected": -3.03125, "logps/chosen": -608.0, "logps/rejected": -1096.0, "loss": 0.2704, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.25, "rewards/margins": 5.125, "rewards/rejected": -9.375, "step": 8690 }, { "epoch": 0.6020344612829562, "grad_norm": 19.039093489315704, "learning_rate": 2.048642737141635e-07, "logits/chosen": -2.875, "logits/rejected": -3.15625, "logps/chosen": -660.0, "logps/rejected": -1104.0, "loss": 0.2459, "rewards/accuracies": 0.9375, "rewards/chosen": -4.875, "rewards/margins": 4.53125, "rewards/rejected": -9.4375, "step": 8700 }, { "epoch": 0.6027264549166148, "grad_norm": 22.53052771385326, "learning_rate": 2.042704099841115e-07, "logits/chosen": -2.921875, "logits/rejected": -3.40625, "logps/chosen": -628.0, "logps/rejected": -1104.0, "loss": 0.2339, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.53125, "rewards/margins": 4.96875, "rewards/rejected": -9.5, "step": 8710 }, { "epoch": 0.6034184485502734, "grad_norm": 33.648897237787104, "learning_rate": 2.0367681310954157e-07, "logits/chosen": -2.796875, "logits/rejected": -3.0625, "logps/chosen": -676.0, "logps/rejected": -1136.0, "loss": 0.2945, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.875, "rewards/margins": 4.84375, "rewards/rejected": -9.75, "step": 8720 }, { "epoch": 0.604110442183932, "grad_norm": 18.53426583638598, "learning_rate": 2.0308348655439406e-07, "logits/chosen": -2.90625, "logits/rejected": -2.921875, "logps/chosen": -624.0, "logps/rejected": -1104.0, "loss": 0.2844, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.5625, "rewards/margins": 4.8125, "rewards/rejected": -9.375, "step": 8730 }, { "epoch": 0.6048024358175905, "grad_norm": 29.361678738076023, "learning_rate": 2.024904337810321e-07, "logits/chosen": -2.96875, "logits/rejected": -3.03125, "logps/chosen": -600.0, "logps/rejected": -1112.0, "loss": 0.2047, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.21875, "rewards/margins": 5.15625, "rewards/rejected": -9.375, "step": 8740 }, { "epoch": 0.6054944294512491, "grad_norm": 19.89813288949973, "learning_rate": 2.0189765825022132e-07, "logits/chosen": -2.9375, "logits/rejected": -3.09375, "logps/chosen": -636.0, "logps/rejected": -1088.0, "loss": 0.2201, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.625, "rewards/margins": 4.625, "rewards/rejected": -9.25, "step": 8750 }, { "epoch": 0.6061864230849077, "grad_norm": 19.959689588247542, "learning_rate": 2.0130516342110894e-07, "logits/chosen": -2.8125, "logits/rejected": -3.0, "logps/chosen": -644.0, "logps/rejected": -1104.0, "loss": 0.228, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.59375, "rewards/margins": 4.78125, "rewards/rejected": -9.375, "step": 8760 }, { "epoch": 0.6068784167185662, "grad_norm": 16.417548979151217, "learning_rate": 2.007129527512047e-07, "logits/chosen": -2.890625, "logits/rejected": -3.078125, "logps/chosen": -644.0, "logps/rejected": -1136.0, "loss": 0.1907, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.71875, "rewards/margins": 5.0625, "rewards/rejected": -9.75, "step": 8770 }, { "epoch": 0.6075704103522248, "grad_norm": 20.74838363498926, "learning_rate": 2.0012102969635994e-07, "logits/chosen": -2.921875, "logits/rejected": -3.21875, "logps/chosen": -644.0, "logps/rejected": -1168.0, "loss": 0.2098, "rewards/accuracies": 0.96875, "rewards/chosen": -4.625, "rewards/margins": 5.15625, "rewards/rejected": -9.8125, "step": 8780 }, { "epoch": 0.6082624039858834, "grad_norm": 15.65574092345487, "learning_rate": 1.995293977107475e-07, "logits/chosen": -2.96875, "logits/rejected": -3.15625, "logps/chosen": -624.0, "logps/rejected": -1160.0, "loss": 0.256, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.5, "rewards/margins": 5.4375, "rewards/rejected": -9.9375, "step": 8790 }, { "epoch": 0.608954397619542, "grad_norm": 29.80129007820759, "learning_rate": 1.989380602468417e-07, "logits/chosen": -2.84375, "logits/rejected": -2.96875, "logps/chosen": -636.0, "logps/rejected": -1152.0, "loss": 0.2289, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.46875, "rewards/margins": 5.375, "rewards/rejected": -9.8125, "step": 8800 }, { "epoch": 0.6096463912532005, "grad_norm": 27.717437506501795, "learning_rate": 1.9834702075539848e-07, "logits/chosen": -2.921875, "logits/rejected": -3.25, "logps/chosen": -672.0, "logps/rejected": -1152.0, "loss": 0.2763, "rewards/accuracies": 0.9375, "rewards/chosen": -4.9375, "rewards/margins": 4.90625, "rewards/rejected": -9.875, "step": 8810 }, { "epoch": 0.610338384886859, "grad_norm": 23.106698245966527, "learning_rate": 1.9775628268543455e-07, "logits/chosen": -2.875, "logits/rejected": -3.109375, "logps/chosen": -668.0, "logps/rejected": -1168.0, "loss": 0.2388, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.71875, "rewards/margins": 5.21875, "rewards/rejected": -9.9375, "step": 8820 }, { "epoch": 0.6110303785205176, "grad_norm": 15.654097568318399, "learning_rate": 1.971658494842079e-07, "logits/chosen": -3.03125, "logits/rejected": -3.21875, "logps/chosen": -680.0, "logps/rejected": -1168.0, "loss": 0.2764, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.125, "rewards/margins": 4.75, "rewards/rejected": -9.875, "step": 8830 }, { "epoch": 0.6117223721541761, "grad_norm": 22.786412169548743, "learning_rate": 1.965757245971975e-07, "logits/chosen": -2.90625, "logits/rejected": -3.21875, "logps/chosen": -684.0, "logps/rejected": -1120.0, "loss": 0.2293, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.03125, "rewards/margins": 4.46875, "rewards/rejected": -9.5, "step": 8840 }, { "epoch": 0.6124143657878347, "grad_norm": 19.571534021832292, "learning_rate": 1.9598591146808286e-07, "logits/chosen": -2.9375, "logits/rejected": -3.265625, "logps/chosen": -644.0, "logps/rejected": -1112.0, "loss": 0.2492, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.6875, "rewards/margins": 4.8125, "rewards/rejected": -9.5, "step": 8850 }, { "epoch": 0.6131063594214933, "grad_norm": 23.360854971324084, "learning_rate": 1.9539641353872455e-07, "logits/chosen": -3.0, "logits/rejected": -3.1875, "logps/chosen": -636.0, "logps/rejected": -1128.0, "loss": 0.2082, "rewards/accuracies": 0.96875, "rewards/chosen": -4.65625, "rewards/margins": 5.03125, "rewards/rejected": -9.6875, "step": 8860 }, { "epoch": 0.6137983530551518, "grad_norm": 20.837636934555608, "learning_rate": 1.9480723424914374e-07, "logits/chosen": -3.015625, "logits/rejected": -3.328125, "logps/chosen": -648.0, "logps/rejected": -1120.0, "loss": 0.2241, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.71875, "rewards/margins": 4.9375, "rewards/rejected": -9.625, "step": 8870 }, { "epoch": 0.6144903466888104, "grad_norm": 22.55897027791573, "learning_rate": 1.9421837703750192e-07, "logits/chosen": -2.90625, "logits/rejected": -3.140625, "logps/chosen": -660.0, "logps/rejected": -1096.0, "loss": 0.246, "rewards/accuracies": 0.9375, "rewards/chosen": -4.8125, "rewards/margins": 4.625, "rewards/rejected": -9.4375, "step": 8880 }, { "epoch": 0.615182340322469, "grad_norm": 24.036138258702263, "learning_rate": 1.936298453400813e-07, "logits/chosen": -2.765625, "logits/rejected": -2.953125, "logps/chosen": -660.0, "logps/rejected": -1152.0, "loss": 0.2315, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.8125, "rewards/margins": 5.0, "rewards/rejected": -9.8125, "step": 8890 }, { "epoch": 0.6158743339561276, "grad_norm": 35.013659155861184, "learning_rate": 1.9304164259126472e-07, "logits/chosen": -2.953125, "logits/rejected": -3.109375, "logps/chosen": -684.0, "logps/rejected": -1144.0, "loss": 0.2564, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.09375, "rewards/margins": 4.71875, "rewards/rejected": -9.8125, "step": 8900 }, { "epoch": 0.6165663275897861, "grad_norm": 28.352804853596965, "learning_rate": 1.924537722235149e-07, "logits/chosen": -3.0, "logits/rejected": -3.265625, "logps/chosen": -676.0, "logps/rejected": -1136.0, "loss": 0.2499, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.0, "rewards/margins": 4.78125, "rewards/rejected": -9.75, "step": 8910 }, { "epoch": 0.6172583212234447, "grad_norm": 20.82702952197283, "learning_rate": 1.918662376673555e-07, "logits/chosen": -2.84375, "logits/rejected": -2.875, "logps/chosen": -636.0, "logps/rejected": -1160.0, "loss": 0.2112, "rewards/accuracies": 0.96875, "rewards/chosen": -4.65625, "rewards/margins": 5.21875, "rewards/rejected": -9.875, "step": 8920 }, { "epoch": 0.6179503148571033, "grad_norm": 25.42760324926343, "learning_rate": 1.912790423513503e-07, "logits/chosen": -2.90625, "logits/rejected": -3.25, "logps/chosen": -624.0, "logps/rejected": -1056.0, "loss": 0.2241, "rewards/accuracies": 0.9375, "rewards/chosen": -4.5625, "rewards/margins": 4.5, "rewards/rejected": -9.0625, "step": 8930 }, { "epoch": 0.6186423084907618, "grad_norm": 14.72185744993456, "learning_rate": 1.906921897020834e-07, "logits/chosen": -2.859375, "logits/rejected": -3.03125, "logps/chosen": -640.0, "logps/rejected": -1152.0, "loss": 0.2088, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.6875, "rewards/margins": 5.09375, "rewards/rejected": -9.75, "step": 8940 }, { "epoch": 0.6193343021244204, "grad_norm": 19.016933984676164, "learning_rate": 1.901056831441393e-07, "logits/chosen": -2.859375, "logits/rejected": -3.078125, "logps/chosen": -636.0, "logps/rejected": -1152.0, "loss": 0.1919, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.5625, "rewards/margins": 5.21875, "rewards/rejected": -9.8125, "step": 8950 }, { "epoch": 0.620026295758079, "grad_norm": 20.01415435051089, "learning_rate": 1.895195261000831e-07, "logits/chosen": -2.875, "logits/rejected": -3.203125, "logps/chosen": -664.0, "logps/rejected": -1144.0, "loss": 0.2173, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.875, "rewards/margins": 5.09375, "rewards/rejected": -9.9375, "step": 8960 }, { "epoch": 0.6207182893917376, "grad_norm": 24.95887414990734, "learning_rate": 1.8893372199043995e-07, "logits/chosen": -2.953125, "logits/rejected": -3.25, "logps/chosen": -644.0, "logps/rejected": -1104.0, "loss": 0.2309, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.875, "rewards/margins": 4.65625, "rewards/rejected": -9.5, "step": 8970 }, { "epoch": 0.6214102830253961, "grad_norm": 20.266685409945865, "learning_rate": 1.8834827423367567e-07, "logits/chosen": -2.953125, "logits/rejected": -3.265625, "logps/chosen": -652.0, "logps/rejected": -1120.0, "loss": 0.2329, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.71875, "rewards/margins": 4.96875, "rewards/rejected": -9.6875, "step": 8980 }, { "epoch": 0.6221022766590547, "grad_norm": 24.425491428830256, "learning_rate": 1.8776318624617644e-07, "logits/chosen": -2.953125, "logits/rejected": -3.203125, "logps/chosen": -624.0, "logps/rejected": -1152.0, "loss": 0.1961, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.65625, "rewards/margins": 5.34375, "rewards/rejected": -10.0, "step": 8990 }, { "epoch": 0.6227942702927133, "grad_norm": 10.97333304996101, "learning_rate": 1.871784614422293e-07, "logits/chosen": -2.828125, "logits/rejected": -3.078125, "logps/chosen": -624.0, "logps/rejected": -1104.0, "loss": 0.263, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.46875, "rewards/margins": 4.90625, "rewards/rejected": -9.375, "step": 9000 }, { "epoch": 0.6227942702927133, "eval_logits/chosen": -2.875, "eval_logits/rejected": -3.109375, "eval_logps/chosen": -664.0, "eval_logps/rejected": -1064.0, "eval_loss": 0.22911322116851807, "eval_rewards/accuracies": 0.8979725241661072, "eval_rewards/chosen": -4.75, "eval_rewards/margins": 4.21875, "eval_rewards/rejected": -9.0, "eval_runtime": 2934.2294, "eval_samples_per_second": 33.342, "eval_steps_per_second": 0.521, "step": 9000 }, { "epoch": 0.6234862639263719, "grad_norm": 18.087623259172567, "learning_rate": 1.8659410323400152e-07, "logits/chosen": -2.921875, "logits/rejected": -3.1875, "logps/chosen": -624.0, "logps/rejected": -1080.0, "loss": 0.1817, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.59375, "rewards/margins": 4.625, "rewards/rejected": -9.1875, "step": 9010 }, { "epoch": 0.6241782575600304, "grad_norm": 20.090836021647274, "learning_rate": 1.8601011503152135e-07, "logits/chosen": -2.96875, "logits/rejected": -3.15625, "logps/chosen": -620.0, "logps/rejected": -1144.0, "loss": 0.2245, "rewards/accuracies": 0.96875, "rewards/chosen": -4.5, "rewards/margins": 5.28125, "rewards/rejected": -9.75, "step": 9020 }, { "epoch": 0.624870251193689, "grad_norm": 15.171868057826108, "learning_rate": 1.8542650024265794e-07, "logits/chosen": -2.875, "logits/rejected": -3.140625, "logps/chosen": -612.0, "logps/rejected": -1128.0, "loss": 0.227, "rewards/accuracies": 0.96875, "rewards/chosen": -4.40625, "rewards/margins": 5.34375, "rewards/rejected": -9.75, "step": 9030 }, { "epoch": 0.6255622448273476, "grad_norm": 20.256861724372182, "learning_rate": 1.8484326227310098e-07, "logits/chosen": -2.84375, "logits/rejected": -3.125, "logps/chosen": -648.0, "logps/rejected": -1136.0, "loss": 0.2156, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.71875, "rewards/margins": 5.09375, "rewards/rejected": -9.8125, "step": 9040 }, { "epoch": 0.6262542384610061, "grad_norm": 23.596382425939314, "learning_rate": 1.8426040452634174e-07, "logits/chosen": -2.984375, "logits/rejected": -3.3125, "logps/chosen": -688.0, "logps/rejected": -1168.0, "loss": 0.2462, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.8125, "rewards/margins": 5.03125, "rewards/rejected": -9.875, "step": 9050 }, { "epoch": 0.6269462320946647, "grad_norm": 25.246391412902437, "learning_rate": 1.8367793040365244e-07, "logits/chosen": -3.09375, "logits/rejected": -3.34375, "logps/chosen": -640.0, "logps/rejected": -1120.0, "loss": 0.2204, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.8125, "rewards/margins": 4.875, "rewards/rejected": -9.6875, "step": 9060 }, { "epoch": 0.6276382257283233, "grad_norm": 18.982614474254984, "learning_rate": 1.830958433040665e-07, "logits/chosen": -2.890625, "logits/rejected": -3.296875, "logps/chosen": -636.0, "logps/rejected": -1128.0, "loss": 0.2158, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.625, "rewards/margins": 5.21875, "rewards/rejected": -9.8125, "step": 9070 }, { "epoch": 0.6283302193619819, "grad_norm": 16.680052341076998, "learning_rate": 1.8251414662435917e-07, "logits/chosen": -3.015625, "logits/rejected": -3.203125, "logps/chosen": -632.0, "logps/rejected": -1136.0, "loss": 0.2148, "rewards/accuracies": 0.96875, "rewards/chosen": -4.65625, "rewards/margins": 5.15625, "rewards/rejected": -9.8125, "step": 9080 }, { "epoch": 0.6290222129956404, "grad_norm": 22.265623061005073, "learning_rate": 1.819328437590274e-07, "logits/chosen": -3.0, "logits/rejected": -3.375, "logps/chosen": -624.0, "logps/rejected": -1080.0, "loss": 0.2671, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.5625, "rewards/margins": 4.78125, "rewards/rejected": -9.3125, "step": 9090 }, { "epoch": 0.629714206629299, "grad_norm": 29.70696034328868, "learning_rate": 1.813519381002696e-07, "logits/chosen": -2.9375, "logits/rejected": -3.125, "logps/chosen": -668.0, "logps/rejected": -1184.0, "loss": 0.2185, "rewards/accuracies": 0.96875, "rewards/chosen": -4.84375, "rewards/margins": 5.1875, "rewards/rejected": -10.0, "step": 9100 }, { "epoch": 0.6304062002629576, "grad_norm": 13.82707475162408, "learning_rate": 1.8077143303796678e-07, "logits/chosen": -2.859375, "logits/rejected": -3.25, "logps/chosen": -656.0, "logps/rejected": -1120.0, "loss": 0.2236, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.6875, "rewards/margins": 4.875, "rewards/rejected": -9.5625, "step": 9110 }, { "epoch": 0.6310981938966161, "grad_norm": 23.459338278654204, "learning_rate": 1.8019133195966206e-07, "logits/chosen": -2.96875, "logits/rejected": -3.109375, "logps/chosen": -700.0, "logps/rejected": -1152.0, "loss": 0.2228, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.125, "rewards/margins": 4.6875, "rewards/rejected": -9.8125, "step": 9120 }, { "epoch": 0.6317901875302747, "grad_norm": 22.032705821671463, "learning_rate": 1.7961163825054098e-07, "logits/chosen": -3.0, "logits/rejected": -3.046875, "logps/chosen": -680.0, "logps/rejected": -1144.0, "loss": 0.2066, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.0625, "rewards/margins": 4.625, "rewards/rejected": -9.6875, "step": 9130 }, { "epoch": 0.6324821811639333, "grad_norm": 24.610154345083167, "learning_rate": 1.790323552934121e-07, "logits/chosen": -2.9375, "logits/rejected": -3.234375, "logps/chosen": -656.0, "logps/rejected": -1136.0, "loss": 0.2188, "rewards/accuracies": 0.9375, "rewards/chosen": -4.71875, "rewards/margins": 5.0, "rewards/rejected": -9.6875, "step": 9140 }, { "epoch": 0.6331741747975919, "grad_norm": 18.41900487863645, "learning_rate": 1.7845348646868697e-07, "logits/chosen": -2.9375, "logits/rejected": -3.265625, "logps/chosen": -700.0, "logps/rejected": -1168.0, "loss": 0.2496, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.09375, "rewards/margins": 4.96875, "rewards/rejected": -10.0625, "step": 9150 }, { "epoch": 0.6338661684312504, "grad_norm": 21.002310962105955, "learning_rate": 1.7787503515436036e-07, "logits/chosen": -3.046875, "logits/rejected": -3.25, "logps/chosen": -652.0, "logps/rejected": -1176.0, "loss": 0.2371, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.90625, "rewards/margins": 5.15625, "rewards/rejected": -10.0625, "step": 9160 }, { "epoch": 0.634558162064909, "grad_norm": 23.132247476121066, "learning_rate": 1.7729700472599077e-07, "logits/chosen": -3.03125, "logits/rejected": -3.3125, "logps/chosen": -656.0, "logps/rejected": -1096.0, "loss": 0.2477, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.78125, "rewards/margins": 4.6875, "rewards/rejected": -9.4375, "step": 9170 }, { "epoch": 0.6352501556985676, "grad_norm": 20.55965509769093, "learning_rate": 1.767193985566806e-07, "logits/chosen": -2.859375, "logits/rejected": -3.265625, "logps/chosen": -636.0, "logps/rejected": -1168.0, "loss": 0.2275, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.5625, "rewards/margins": 5.5, "rewards/rejected": -10.0625, "step": 9180 }, { "epoch": 0.6359421493322261, "grad_norm": 28.37994908423175, "learning_rate": 1.761422200170563e-07, "logits/chosen": -3.0625, "logits/rejected": -3.296875, "logps/chosen": -660.0, "logps/rejected": -1136.0, "loss": 0.226, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.90625, "rewards/margins": 4.8125, "rewards/rejected": -9.75, "step": 9190 }, { "epoch": 0.6366341429658847, "grad_norm": 16.404464378791918, "learning_rate": 1.755654724752492e-07, "logits/chosen": -2.9375, "logits/rejected": -3.0625, "logps/chosen": -676.0, "logps/rejected": -1144.0, "loss": 0.2564, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.96875, "rewards/margins": 4.65625, "rewards/rejected": -9.625, "step": 9200 }, { "epoch": 0.6373261365995433, "grad_norm": 20.094472550617827, "learning_rate": 1.7498915929687541e-07, "logits/chosen": -2.9375, "logits/rejected": -3.34375, "logps/chosen": -636.0, "logps/rejected": -1152.0, "loss": 0.2417, "rewards/accuracies": 0.96875, "rewards/chosen": -4.6875, "rewards/margins": 5.28125, "rewards/rejected": -9.9375, "step": 9210 }, { "epoch": 0.6380181302332019, "grad_norm": 23.66660729784941, "learning_rate": 1.744132838450161e-07, "logits/chosen": -2.921875, "logits/rejected": -3.25, "logps/chosen": -664.0, "logps/rejected": -1128.0, "loss": 0.2279, "rewards/accuracies": 0.90625, "rewards/chosen": -4.875, "rewards/margins": 4.8125, "rewards/rejected": -9.6875, "step": 9220 }, { "epoch": 0.6387101238668604, "grad_norm": 17.83497960056337, "learning_rate": 1.738378494801983e-07, "logits/chosen": -2.96875, "logits/rejected": -3.28125, "logps/chosen": -636.0, "logps/rejected": -1192.0, "loss": 0.2369, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.625, "rewards/margins": 5.65625, "rewards/rejected": -10.25, "step": 9230 }, { "epoch": 0.639402117500519, "grad_norm": 16.789511908275376, "learning_rate": 1.7326285956037524e-07, "logits/chosen": -2.875, "logits/rejected": -3.109375, "logps/chosen": -652.0, "logps/rejected": -1144.0, "loss": 0.2044, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.65625, "rewards/margins": 5.15625, "rewards/rejected": -9.8125, "step": 9240 }, { "epoch": 0.6400941111341776, "grad_norm": 24.848434181636105, "learning_rate": 1.72688317440906e-07, "logits/chosen": -2.9375, "logits/rejected": -3.390625, "logps/chosen": -628.0, "logps/rejected": -1160.0, "loss": 0.2133, "rewards/accuracies": 0.96875, "rewards/chosen": -4.4375, "rewards/margins": 5.5625, "rewards/rejected": -10.0, "step": 9250 }, { "epoch": 0.6407861047678362, "grad_norm": 21.234238312331755, "learning_rate": 1.7211422647453716e-07, "logits/chosen": -2.875, "logits/rejected": -3.140625, "logps/chosen": -652.0, "logps/rejected": -1112.0, "loss": 0.2167, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.5, "rewards/margins": 5.0, "rewards/rejected": -9.5, "step": 9260 }, { "epoch": 0.6414780984014947, "grad_norm": 25.894902720874786, "learning_rate": 1.7154059001138233e-07, "logits/chosen": -2.9375, "logits/rejected": -3.3125, "logps/chosen": -648.0, "logps/rejected": -1144.0, "loss": 0.2204, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.75, "rewards/margins": 5.125, "rewards/rejected": -9.875, "step": 9270 }, { "epoch": 0.6421700920351533, "grad_norm": 19.696630287900202, "learning_rate": 1.7096741139890274e-07, "logits/chosen": -2.890625, "logits/rejected": -3.203125, "logps/chosen": -660.0, "logps/rejected": -1160.0, "loss": 0.2366, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.78125, "rewards/margins": 5.125, "rewards/rejected": -9.875, "step": 9280 }, { "epoch": 0.6428620856688119, "grad_norm": 19.050953573360662, "learning_rate": 1.7039469398188788e-07, "logits/chosen": -2.875, "logits/rejected": -3.171875, "logps/chosen": -660.0, "logps/rejected": -1168.0, "loss": 0.2094, "rewards/accuracies": 0.96875, "rewards/chosen": -4.8125, "rewards/margins": 5.25, "rewards/rejected": -10.0625, "step": 9290 }, { "epoch": 0.6435540793024704, "grad_norm": 18.328545410716295, "learning_rate": 1.6982244110243623e-07, "logits/chosen": -2.890625, "logits/rejected": -3.0625, "logps/chosen": -664.0, "logps/rejected": -1112.0, "loss": 0.2292, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.96875, "rewards/margins": 4.59375, "rewards/rejected": -9.5625, "step": 9300 }, { "epoch": 0.644246072936129, "grad_norm": 19.827754453090787, "learning_rate": 1.6925065609993502e-07, "logits/chosen": -2.953125, "logits/rejected": -3.375, "logps/chosen": -656.0, "logps/rejected": -1160.0, "loss": 0.2125, "rewards/accuracies": 0.9375, "rewards/chosen": -4.8125, "rewards/margins": 5.25, "rewards/rejected": -10.0625, "step": 9310 }, { "epoch": 0.6449380665697876, "grad_norm": 30.503106487893675, "learning_rate": 1.6867934231104145e-07, "logits/chosen": -2.96875, "logits/rejected": -3.40625, "logps/chosen": -652.0, "logps/rejected": -1168.0, "loss": 0.1979, "rewards/accuracies": 0.96875, "rewards/chosen": -4.84375, "rewards/margins": 5.28125, "rewards/rejected": -10.125, "step": 9320 }, { "epoch": 0.6456300602034462, "grad_norm": 20.675631347200923, "learning_rate": 1.681085030696629e-07, "logits/chosen": -2.96875, "logits/rejected": -3.203125, "logps/chosen": -660.0, "logps/rejected": -1176.0, "loss": 0.254, "rewards/accuracies": 0.9375, "rewards/chosen": -4.78125, "rewards/margins": 5.3125, "rewards/rejected": -10.0625, "step": 9330 }, { "epoch": 0.6463220538371047, "grad_norm": 19.122115128347893, "learning_rate": 1.6753814170693736e-07, "logits/chosen": -2.9375, "logits/rejected": -3.25, "logps/chosen": -668.0, "logps/rejected": -1144.0, "loss": 0.2657, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.75, "rewards/margins": 5.09375, "rewards/rejected": -9.875, "step": 9340 }, { "epoch": 0.6470140474707633, "grad_norm": 24.710609722914246, "learning_rate": 1.6696826155121446e-07, "logits/chosen": -2.953125, "logits/rejected": -3.34375, "logps/chosen": -660.0, "logps/rejected": -1168.0, "loss": 0.2467, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.78125, "rewards/margins": 5.40625, "rewards/rejected": -10.1875, "step": 9350 }, { "epoch": 0.6477060411044219, "grad_norm": 19.73275078811698, "learning_rate": 1.6639886592803555e-07, "logits/chosen": -2.9375, "logits/rejected": -3.28125, "logps/chosen": -632.0, "logps/rejected": -1104.0, "loss": 0.246, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.5625, "rewards/margins": 4.90625, "rewards/rejected": -9.5, "step": 9360 }, { "epoch": 0.6483980347380804, "grad_norm": 22.282709037167567, "learning_rate": 1.658299581601144e-07, "logits/chosen": -2.828125, "logits/rejected": -3.109375, "logps/chosen": -632.0, "logps/rejected": -1144.0, "loss": 0.2468, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.625, "rewards/margins": 5.09375, "rewards/rejected": -9.75, "step": 9370 }, { "epoch": 0.649090028371739, "grad_norm": 23.40229798419591, "learning_rate": 1.6526154156731804e-07, "logits/chosen": -2.96875, "logits/rejected": -3.09375, "logps/chosen": -628.0, "logps/rejected": -1160.0, "loss": 0.2237, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.71875, "rewards/margins": 5.21875, "rewards/rejected": -9.9375, "step": 9380 }, { "epoch": 0.6497820220053976, "grad_norm": 23.434401673943334, "learning_rate": 1.646936194666474e-07, "logits/chosen": -2.953125, "logits/rejected": -3.265625, "logps/chosen": -648.0, "logps/rejected": -1080.0, "loss": 0.2338, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.75, "rewards/margins": 4.53125, "rewards/rejected": -9.25, "step": 9390 }, { "epoch": 0.6504740156390562, "grad_norm": 21.57976385677066, "learning_rate": 1.6412619517221727e-07, "logits/chosen": -2.90625, "logits/rejected": -3.359375, "logps/chosen": -648.0, "logps/rejected": -1152.0, "loss": 0.2313, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.8125, "rewards/margins": 4.96875, "rewards/rejected": -9.75, "step": 9400 }, { "epoch": 0.6511660092727147, "grad_norm": 14.518866879494482, "learning_rate": 1.6355927199523806e-07, "logits/chosen": -2.921875, "logits/rejected": -3.15625, "logps/chosen": -632.0, "logps/rejected": -1160.0, "loss": 0.1938, "rewards/accuracies": 0.96875, "rewards/chosen": -4.5625, "rewards/margins": 5.46875, "rewards/rejected": -10.0625, "step": 9410 }, { "epoch": 0.6518580029063733, "grad_norm": 23.85065285925852, "learning_rate": 1.6299285324399565e-07, "logits/chosen": -3.03125, "logits/rejected": -3.234375, "logps/chosen": -664.0, "logps/rejected": -1136.0, "loss": 0.2381, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.875, "rewards/margins": 4.8125, "rewards/rejected": -9.6875, "step": 9420 }, { "epoch": 0.6525499965400319, "grad_norm": 19.047607374459144, "learning_rate": 1.6242694222383225e-07, "logits/chosen": -2.875, "logits/rejected": -3.3125, "logps/chosen": -672.0, "logps/rejected": -1120.0, "loss": 0.255, "rewards/accuracies": 0.90625, "rewards/chosen": -4.96875, "rewards/margins": 4.6875, "rewards/rejected": -9.6875, "step": 9430 }, { "epoch": 0.6532419901736904, "grad_norm": 18.77255500794696, "learning_rate": 1.6186154223712733e-07, "logits/chosen": -2.953125, "logits/rejected": -3.203125, "logps/chosen": -632.0, "logps/rejected": -1168.0, "loss": 0.2139, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.59375, "rewards/margins": 5.34375, "rewards/rejected": -9.9375, "step": 9440 }, { "epoch": 0.653933983807349, "grad_norm": 31.40753143010121, "learning_rate": 1.6129665658327828e-07, "logits/chosen": -3.015625, "logits/rejected": -3.3125, "logps/chosen": -640.0, "logps/rejected": -1128.0, "loss": 0.2331, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.71875, "rewards/margins": 5.03125, "rewards/rejected": -9.75, "step": 9450 }, { "epoch": 0.6546259774410076, "grad_norm": 24.6259115949581, "learning_rate": 1.6073228855868096e-07, "logits/chosen": -2.953125, "logits/rejected": -3.265625, "logps/chosen": -672.0, "logps/rejected": -1152.0, "loss": 0.2524, "rewards/accuracies": 0.9375, "rewards/chosen": -5.0, "rewards/margins": 4.90625, "rewards/rejected": -9.9375, "step": 9460 }, { "epoch": 0.6553179710746662, "grad_norm": 31.393344952356227, "learning_rate": 1.601684414567106e-07, "logits/chosen": -2.8125, "logits/rejected": -3.046875, "logps/chosen": -648.0, "logps/rejected": -1144.0, "loss": 0.2291, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.625, "rewards/margins": 5.125, "rewards/rejected": -9.75, "step": 9470 }, { "epoch": 0.6560099647083247, "grad_norm": 12.628743950998377, "learning_rate": 1.5960511856770275e-07, "logits/chosen": -2.875, "logits/rejected": -3.265625, "logps/chosen": -672.0, "logps/rejected": -1096.0, "loss": 0.2365, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.78125, "rewards/margins": 4.5625, "rewards/rejected": -9.375, "step": 9480 }, { "epoch": 0.6567019583419833, "grad_norm": 21.465940515569994, "learning_rate": 1.5904232317893358e-07, "logits/chosen": -2.8125, "logits/rejected": -2.90625, "logps/chosen": -692.0, "logps/rejected": -1160.0, "loss": 0.2551, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.03125, "rewards/margins": 5.0, "rewards/rejected": -10.0, "step": 9490 }, { "epoch": 0.6573939519756419, "grad_norm": 22.727548602573215, "learning_rate": 1.584800585746014e-07, "logits/chosen": -2.828125, "logits/rejected": -3.265625, "logps/chosen": -652.0, "logps/rejected": -1120.0, "loss": 0.2351, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.75, "rewards/margins": 4.84375, "rewards/rejected": -9.625, "step": 9500 }, { "epoch": 0.6580859456093004, "grad_norm": 15.29391647932165, "learning_rate": 1.5791832803580702e-07, "logits/chosen": -2.875, "logits/rejected": -2.921875, "logps/chosen": -648.0, "logps/rejected": -1104.0, "loss": 0.241, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.78125, "rewards/margins": 4.78125, "rewards/rejected": -9.5625, "step": 9510 }, { "epoch": 0.6587779392429589, "grad_norm": 15.465726285041782, "learning_rate": 1.573571348405344e-07, "logits/chosen": -2.875, "logits/rejected": -3.046875, "logps/chosen": -652.0, "logps/rejected": -1136.0, "loss": 0.2232, "rewards/accuracies": 0.9375, "rewards/chosen": -4.78125, "rewards/margins": 4.90625, "rewards/rejected": -9.6875, "step": 9520 }, { "epoch": 0.6594699328766175, "grad_norm": 17.36208991174216, "learning_rate": 1.5679648226363225e-07, "logits/chosen": -3.125, "logits/rejected": -3.28125, "logps/chosen": -648.0, "logps/rejected": -1216.0, "loss": 0.2042, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.6875, "rewards/margins": 5.65625, "rewards/rejected": -10.375, "step": 9530 }, { "epoch": 0.660161926510276, "grad_norm": 19.674403451526864, "learning_rate": 1.5623637357679442e-07, "logits/chosen": -2.890625, "logits/rejected": -3.296875, "logps/chosen": -680.0, "logps/rejected": -1200.0, "loss": 0.2204, "rewards/accuracies": 0.96875, "rewards/chosen": -4.96875, "rewards/margins": 5.25, "rewards/rejected": -10.1875, "step": 9540 }, { "epoch": 0.6608539201439346, "grad_norm": 24.858938157363795, "learning_rate": 1.5567681204854047e-07, "logits/chosen": -2.9375, "logits/rejected": -3.203125, "logps/chosen": -660.0, "logps/rejected": -1152.0, "loss": 0.257, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.84375, "rewards/margins": 5.09375, "rewards/rejected": -10.0, "step": 9550 }, { "epoch": 0.6615459137775932, "grad_norm": 19.946787950891082, "learning_rate": 1.5511780094419751e-07, "logits/chosen": -2.984375, "logits/rejected": -3.34375, "logps/chosen": -656.0, "logps/rejected": -1120.0, "loss": 0.2279, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.8125, "rewards/margins": 4.78125, "rewards/rejected": -9.5625, "step": 9560 }, { "epoch": 0.6622379074112518, "grad_norm": 25.207923780043593, "learning_rate": 1.545593435258805e-07, "logits/chosen": -2.96875, "logits/rejected": -3.046875, "logps/chosen": -656.0, "logps/rejected": -1192.0, "loss": 0.2499, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.8125, "rewards/margins": 5.40625, "rewards/rejected": -10.25, "step": 9570 }, { "epoch": 0.6629299010449103, "grad_norm": 27.44590098418269, "learning_rate": 1.5400144305247324e-07, "logits/chosen": -2.890625, "logits/rejected": -3.21875, "logps/chosen": -700.0, "logps/rejected": -1152.0, "loss": 0.248, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.28125, "rewards/margins": 4.5625, "rewards/rejected": -9.875, "step": 9580 }, { "epoch": 0.6636218946785689, "grad_norm": 27.425463487097183, "learning_rate": 1.5344410277960945e-07, "logits/chosen": -2.890625, "logits/rejected": -3.390625, "logps/chosen": -628.0, "logps/rejected": -1104.0, "loss": 0.1982, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.46875, "rewards/margins": 5.0625, "rewards/rejected": -9.5625, "step": 9590 }, { "epoch": 0.6643138883122275, "grad_norm": 25.81911786368618, "learning_rate": 1.5288732595965416e-07, "logits/chosen": -2.875, "logits/rejected": -3.3125, "logps/chosen": -656.0, "logps/rejected": -1128.0, "loss": 0.2442, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.78125, "rewards/margins": 4.9375, "rewards/rejected": -9.6875, "step": 9600 }, { "epoch": 0.6650058819458861, "grad_norm": 19.988683126053882, "learning_rate": 1.5233111584168384e-07, "logits/chosen": -3.078125, "logits/rejected": -3.421875, "logps/chosen": -636.0, "logps/rejected": -1096.0, "loss": 0.2148, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.6875, "rewards/margins": 4.71875, "rewards/rejected": -9.375, "step": 9610 }, { "epoch": 0.6656978755795446, "grad_norm": 18.576679607450725, "learning_rate": 1.5177547567146832e-07, "logits/chosen": -2.96875, "logits/rejected": -3.234375, "logps/chosen": -644.0, "logps/rejected": -1144.0, "loss": 0.2309, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.625, "rewards/margins": 5.3125, "rewards/rejected": -9.9375, "step": 9620 }, { "epoch": 0.6663898692132032, "grad_norm": 16.94182709519211, "learning_rate": 1.5122040869145142e-07, "logits/chosen": -2.921875, "logits/rejected": -3.21875, "logps/chosen": -656.0, "logps/rejected": -1120.0, "loss": 0.2406, "rewards/accuracies": 0.9375, "rewards/chosen": -4.75, "rewards/margins": 4.6875, "rewards/rejected": -9.4375, "step": 9630 }, { "epoch": 0.6670818628468618, "grad_norm": 16.765464853267293, "learning_rate": 1.5066591814073198e-07, "logits/chosen": -2.796875, "logits/rejected": -3.296875, "logps/chosen": -640.0, "logps/rejected": -1112.0, "loss": 0.2049, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.4375, "rewards/margins": 5.03125, "rewards/rejected": -9.4375, "step": 9640 }, { "epoch": 0.6677738564805203, "grad_norm": 18.07625301707142, "learning_rate": 1.501120072550453e-07, "logits/chosen": -3.0, "logits/rejected": -3.203125, "logps/chosen": -660.0, "logps/rejected": -1144.0, "loss": 0.2785, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.875, "rewards/margins": 5.0, "rewards/rejected": -9.875, "step": 9650 }, { "epoch": 0.6684658501141789, "grad_norm": 19.38912391276163, "learning_rate": 1.4955867926674388e-07, "logits/chosen": -2.921875, "logits/rejected": -3.328125, "logps/chosen": -644.0, "logps/rejected": -1120.0, "loss": 0.2319, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.5625, "rewards/margins": 5.0625, "rewards/rejected": -9.625, "step": 9660 }, { "epoch": 0.6691578437478375, "grad_norm": 16.160837281008103, "learning_rate": 1.490059374047788e-07, "logits/chosen": -3.03125, "logits/rejected": -3.328125, "logps/chosen": -624.0, "logps/rejected": -1128.0, "loss": 0.1964, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.5625, "rewards/margins": 5.09375, "rewards/rejected": -9.625, "step": 9670 }, { "epoch": 0.6698498373814961, "grad_norm": 30.974462411385037, "learning_rate": 1.4845378489468059e-07, "logits/chosen": -3.15625, "logits/rejected": -3.453125, "logps/chosen": -652.0, "logps/rejected": -1168.0, "loss": 0.2307, "rewards/accuracies": 0.96875, "rewards/chosen": -4.8125, "rewards/margins": 5.15625, "rewards/rejected": -10.0, "step": 9680 }, { "epoch": 0.6705418310151546, "grad_norm": 20.67050158328596, "learning_rate": 1.4790222495854088e-07, "logits/chosen": -2.921875, "logits/rejected": -3.390625, "logps/chosen": -684.0, "logps/rejected": -1128.0, "loss": 0.2598, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.125, "rewards/margins": 4.65625, "rewards/rejected": -9.8125, "step": 9690 }, { "epoch": 0.6712338246488132, "grad_norm": 19.697703740626118, "learning_rate": 1.473512608149933e-07, "logits/chosen": -2.96875, "logits/rejected": -3.265625, "logps/chosen": -664.0, "logps/rejected": -1160.0, "loss": 0.213, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.875, "rewards/margins": 5.0, "rewards/rejected": -9.875, "step": 9700 }, { "epoch": 0.6719258182824718, "grad_norm": 32.881084118936045, "learning_rate": 1.4680089567919452e-07, "logits/chosen": -3.109375, "logits/rejected": -3.390625, "logps/chosen": -684.0, "logps/rejected": -1160.0, "loss": 0.2505, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.15625, "rewards/margins": 4.90625, "rewards/rejected": -10.0625, "step": 9710 }, { "epoch": 0.6726178119161303, "grad_norm": 19.942683426970493, "learning_rate": 1.4625113276280577e-07, "logits/chosen": -2.828125, "logits/rejected": -3.171875, "logps/chosen": -612.0, "logps/rejected": -1104.0, "loss": 0.2261, "rewards/accuracies": 0.96875, "rewards/chosen": -4.375, "rewards/margins": 5.09375, "rewards/rejected": -9.4375, "step": 9720 }, { "epoch": 0.6733098055497889, "grad_norm": 26.54186161123292, "learning_rate": 1.4570197527397435e-07, "logits/chosen": -2.96875, "logits/rejected": -3.171875, "logps/chosen": -644.0, "logps/rejected": -1112.0, "loss": 0.2327, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.65625, "rewards/margins": 4.84375, "rewards/rejected": -9.5, "step": 9730 }, { "epoch": 0.6740017991834475, "grad_norm": 23.34263750379134, "learning_rate": 1.4515342641731398e-07, "logits/chosen": -2.96875, "logits/rejected": -3.046875, "logps/chosen": -668.0, "logps/rejected": -1168.0, "loss": 0.2234, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.84375, "rewards/margins": 5.0, "rewards/rejected": -9.875, "step": 9740 }, { "epoch": 0.6746937928171061, "grad_norm": 27.138418234384915, "learning_rate": 1.446054893938871e-07, "logits/chosen": -2.90625, "logits/rejected": -2.984375, "logps/chosen": -676.0, "logps/rejected": -1120.0, "loss": 0.2497, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.84375, "rewards/margins": 4.59375, "rewards/rejected": -9.4375, "step": 9750 }, { "epoch": 0.6753857864507646, "grad_norm": 22.22976902332629, "learning_rate": 1.4405816740118593e-07, "logits/chosen": -3.015625, "logits/rejected": -3.1875, "logps/chosen": -664.0, "logps/rejected": -1152.0, "loss": 0.2027, "rewards/accuracies": 0.96875, "rewards/chosen": -4.90625, "rewards/margins": 4.9375, "rewards/rejected": -9.875, "step": 9760 }, { "epoch": 0.6760777800844232, "grad_norm": 27.180426772574712, "learning_rate": 1.4351146363311322e-07, "logits/chosen": -3.015625, "logits/rejected": -3.203125, "logps/chosen": -680.0, "logps/rejected": -1200.0, "loss": 0.2047, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.03125, "rewards/margins": 5.125, "rewards/rejected": -10.1875, "step": 9770 }, { "epoch": 0.6767697737180818, "grad_norm": 19.252661456591316, "learning_rate": 1.429653812799644e-07, "logits/chosen": -2.84375, "logits/rejected": -3.125, "logps/chosen": -664.0, "logps/rejected": -1168.0, "loss": 0.2301, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.625, "rewards/margins": 5.1875, "rewards/rejected": -9.8125, "step": 9780 }, { "epoch": 0.6774617673517404, "grad_norm": 18.291455710155848, "learning_rate": 1.424199235284085e-07, "logits/chosen": -2.96875, "logits/rejected": -3.296875, "logps/chosen": -688.0, "logps/rejected": -1136.0, "loss": 0.2398, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.15625, "rewards/margins": 4.625, "rewards/rejected": -9.75, "step": 9790 }, { "epoch": 0.6781537609853989, "grad_norm": 26.20509815216147, "learning_rate": 1.4187509356146967e-07, "logits/chosen": -3.125, "logits/rejected": -3.359375, "logps/chosen": -676.0, "logps/rejected": -1144.0, "loss": 0.2425, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.03125, "rewards/margins": 4.8125, "rewards/rejected": -9.875, "step": 9800 }, { "epoch": 0.6788457546190575, "grad_norm": 42.439800040150914, "learning_rate": 1.4133089455850877e-07, "logits/chosen": -2.84375, "logits/rejected": -3.15625, "logps/chosen": -688.0, "logps/rejected": -1152.0, "loss": 0.2358, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.9375, "rewards/margins": 4.71875, "rewards/rejected": -9.6875, "step": 9810 }, { "epoch": 0.6795377482527161, "grad_norm": 30.11752156261635, "learning_rate": 1.4078732969520447e-07, "logits/chosen": -2.90625, "logits/rejected": -3.140625, "logps/chosen": -668.0, "logps/rejected": -1168.0, "loss": 0.2224, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.71875, "rewards/margins": 5.15625, "rewards/rejected": -9.875, "step": 9820 }, { "epoch": 0.6802297418863746, "grad_norm": 24.838482654504624, "learning_rate": 1.40244402143535e-07, "logits/chosen": -2.78125, "logits/rejected": -3.234375, "logps/chosen": -648.0, "logps/rejected": -1144.0, "loss": 0.227, "rewards/accuracies": 0.96875, "rewards/chosen": -4.53125, "rewards/margins": 5.25, "rewards/rejected": -9.75, "step": 9830 }, { "epoch": 0.6809217355200332, "grad_norm": 27.683372393868037, "learning_rate": 1.3970211507175953e-07, "logits/chosen": -2.984375, "logits/rejected": -3.21875, "logps/chosen": -632.0, "logps/rejected": -1096.0, "loss": 0.2213, "rewards/accuracies": 0.9375, "rewards/chosen": -4.625, "rewards/margins": 4.75, "rewards/rejected": -9.375, "step": 9840 }, { "epoch": 0.6816137291536918, "grad_norm": 25.350293568675593, "learning_rate": 1.3916047164439987e-07, "logits/chosen": -3.03125, "logits/rejected": -3.21875, "logps/chosen": -656.0, "logps/rejected": -1200.0, "loss": 0.234, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.90625, "rewards/margins": 5.375, "rewards/rejected": -10.25, "step": 9850 }, { "epoch": 0.6823057227873504, "grad_norm": 18.32096333001813, "learning_rate": 1.386194750222213e-07, "logits/chosen": -3.0, "logits/rejected": -3.3125, "logps/chosen": -644.0, "logps/rejected": -1144.0, "loss": 0.236, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.75, "rewards/margins": 5.09375, "rewards/rejected": -9.875, "step": 9860 }, { "epoch": 0.6829977164210089, "grad_norm": 25.74606208934039, "learning_rate": 1.3807912836221542e-07, "logits/chosen": -2.96875, "logits/rejected": -3.203125, "logps/chosen": -640.0, "logps/rejected": -1208.0, "loss": 0.2121, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.59375, "rewards/margins": 5.6875, "rewards/rejected": -10.3125, "step": 9870 }, { "epoch": 0.6836897100546675, "grad_norm": 19.73545015421009, "learning_rate": 1.3753943481758056e-07, "logits/chosen": -2.9375, "logits/rejected": -3.109375, "logps/chosen": -672.0, "logps/rejected": -1184.0, "loss": 0.2469, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.96875, "rewards/margins": 4.9375, "rewards/rejected": -9.9375, "step": 9880 }, { "epoch": 0.6843817036883261, "grad_norm": 30.32228766181577, "learning_rate": 1.3700039753770364e-07, "logits/chosen": -2.953125, "logits/rejected": -3.28125, "logps/chosen": -632.0, "logps/rejected": -1120.0, "loss": 0.2188, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.6875, "rewards/margins": 5.0, "rewards/rejected": -9.6875, "step": 9890 }, { "epoch": 0.6850736973219846, "grad_norm": 27.988356977116506, "learning_rate": 1.3646201966814214e-07, "logits/chosen": -3.03125, "logits/rejected": -3.3125, "logps/chosen": -664.0, "logps/rejected": -1168.0, "loss": 0.2363, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.9375, "rewards/margins": 5.21875, "rewards/rejected": -10.125, "step": 9900 }, { "epoch": 0.6857656909556432, "grad_norm": 21.098664024699396, "learning_rate": 1.3592430435060577e-07, "logits/chosen": -3.0, "logits/rejected": -3.171875, "logps/chosen": -664.0, "logps/rejected": -1152.0, "loss": 0.2401, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.75, "rewards/margins": 5.125, "rewards/rejected": -9.875, "step": 9910 }, { "epoch": 0.6864576845893018, "grad_norm": 21.01559946532778, "learning_rate": 1.3538725472293737e-07, "logits/chosen": -3.0625, "logits/rejected": -3.328125, "logps/chosen": -652.0, "logps/rejected": -1208.0, "loss": 0.2213, "rewards/accuracies": 0.96875, "rewards/chosen": -4.84375, "rewards/margins": 5.40625, "rewards/rejected": -10.25, "step": 9920 }, { "epoch": 0.6871496782229604, "grad_norm": 23.51782639346904, "learning_rate": 1.3485087391909556e-07, "logits/chosen": -3.078125, "logits/rejected": -3.34375, "logps/chosen": -664.0, "logps/rejected": -1152.0, "loss": 0.2449, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.90625, "rewards/margins": 5.09375, "rewards/rejected": -10.0, "step": 9930 }, { "epoch": 0.6878416718566189, "grad_norm": 20.815503729539717, "learning_rate": 1.3431516506913593e-07, "logits/chosen": -2.921875, "logits/rejected": -3.484375, "logps/chosen": -684.0, "logps/rejected": -1168.0, "loss": 0.2239, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.84375, "rewards/margins": 5.25, "rewards/rejected": -10.125, "step": 9940 }, { "epoch": 0.6885336654902775, "grad_norm": 13.905797464606824, "learning_rate": 1.3378013129919284e-07, "logits/chosen": -3.0625, "logits/rejected": -3.171875, "logps/chosen": -644.0, "logps/rejected": -1152.0, "loss": 0.2273, "rewards/accuracies": 0.9375, "rewards/chosen": -4.84375, "rewards/margins": 5.0625, "rewards/rejected": -9.875, "step": 9950 }, { "epoch": 0.6892256591239361, "grad_norm": 27.33722941057412, "learning_rate": 1.3324577573146124e-07, "logits/chosen": -2.984375, "logits/rejected": -3.359375, "logps/chosen": -640.0, "logps/rejected": -1120.0, "loss": 0.24, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.6875, "rewards/margins": 4.96875, "rewards/rejected": -9.6875, "step": 9960 }, { "epoch": 0.6899176527575946, "grad_norm": 19.326255459604496, "learning_rate": 1.3271210148417838e-07, "logits/chosen": -2.875, "logits/rejected": -3.265625, "logps/chosen": -680.0, "logps/rejected": -1152.0, "loss": 0.1977, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.84375, "rewards/margins": 5.03125, "rewards/rejected": -9.875, "step": 9970 }, { "epoch": 0.6906096463912532, "grad_norm": 28.48389162030399, "learning_rate": 1.3217911167160571e-07, "logits/chosen": -2.921875, "logits/rejected": -3.15625, "logps/chosen": -664.0, "logps/rejected": -1152.0, "loss": 0.2281, "rewards/accuracies": 0.9375, "rewards/chosen": -4.78125, "rewards/margins": 5.1875, "rewards/rejected": -9.9375, "step": 9980 }, { "epoch": 0.6913016400249118, "grad_norm": 27.156340787212255, "learning_rate": 1.3164680940401066e-07, "logits/chosen": -2.875, "logits/rejected": -3.046875, "logps/chosen": -664.0, "logps/rejected": -1168.0, "loss": 0.1962, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.84375, "rewards/margins": 5.15625, "rewards/rejected": -10.0, "step": 9990 }, { "epoch": 0.6919936336585704, "grad_norm": 30.687632635145356, "learning_rate": 1.3111519778764852e-07, "logits/chosen": -3.015625, "logits/rejected": -3.328125, "logps/chosen": -652.0, "logps/rejected": -1120.0, "loss": 0.2417, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.84375, "rewards/margins": 4.78125, "rewards/rejected": -9.625, "step": 10000 }, { "epoch": 0.6919936336585704, "eval_logits/chosen": -2.90625, "eval_logits/rejected": -3.171875, "eval_logps/chosen": -684.0, "eval_logps/rejected": -1088.0, "eval_loss": 0.22805513441562653, "eval_rewards/accuracies": 0.8979725241661072, "eval_rewards/chosen": -4.96875, "eval_rewards/margins": 4.3125, "eval_rewards/rejected": -9.25, "eval_runtime": 2937.4526, "eval_samples_per_second": 33.305, "eval_steps_per_second": 0.521, "step": 10000 }, { "epoch": 0.6926856272922289, "grad_norm": 18.631506593411455, "learning_rate": 1.3058427992474396e-07, "logits/chosen": -2.890625, "logits/rejected": -3.15625, "logps/chosen": -624.0, "logps/rejected": -1152.0, "loss": 0.1963, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.65625, "rewards/margins": 5.28125, "rewards/rejected": -9.9375, "step": 10010 }, { "epoch": 0.6933776209258875, "grad_norm": 21.229014151869308, "learning_rate": 1.300540589134738e-07, "logits/chosen": -2.828125, "logits/rejected": -3.265625, "logps/chosen": -660.0, "logps/rejected": -1168.0, "loss": 0.2115, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.71875, "rewards/margins": 5.53125, "rewards/rejected": -10.25, "step": 10020 }, { "epoch": 0.6940696145595461, "grad_norm": 22.226363259193008, "learning_rate": 1.2952453784794804e-07, "logits/chosen": -2.96875, "logits/rejected": -3.34375, "logps/chosen": -664.0, "logps/rejected": -1136.0, "loss": 0.2172, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.8125, "rewards/margins": 4.90625, "rewards/rejected": -9.6875, "step": 10030 }, { "epoch": 0.6947616081932046, "grad_norm": 22.07183429341008, "learning_rate": 1.28995719818192e-07, "logits/chosen": -2.9375, "logits/rejected": -3.234375, "logps/chosen": -696.0, "logps/rejected": -1208.0, "loss": 0.2123, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.96875, "rewards/margins": 5.34375, "rewards/rejected": -10.3125, "step": 10040 }, { "epoch": 0.6954536018268632, "grad_norm": 22.34788973057751, "learning_rate": 1.2846760791012863e-07, "logits/chosen": -2.9375, "logits/rejected": -3.125, "logps/chosen": -668.0, "logps/rejected": -1160.0, "loss": 0.2536, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.8125, "rewards/margins": 5.15625, "rewards/rejected": -10.0, "step": 10050 }, { "epoch": 0.6961455954605218, "grad_norm": 23.771013126904702, "learning_rate": 1.279402052055605e-07, "logits/chosen": -2.8125, "logits/rejected": -3.296875, "logps/chosen": -676.0, "logps/rejected": -1128.0, "loss": 0.2037, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.8125, "rewards/margins": 4.90625, "rewards/rejected": -9.6875, "step": 10060 }, { "epoch": 0.6968375890941804, "grad_norm": 9.211957937099834, "learning_rate": 1.2741351478215117e-07, "logits/chosen": -3.0625, "logits/rejected": -3.1875, "logps/chosen": -648.0, "logps/rejected": -1224.0, "loss": 0.199, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.84375, "rewards/margins": 5.5625, "rewards/rejected": -10.4375, "step": 10070 }, { "epoch": 0.6975295827278389, "grad_norm": 29.723295821287195, "learning_rate": 1.2688753971340795e-07, "logits/chosen": -3.0, "logits/rejected": -3.171875, "logps/chosen": -660.0, "logps/rejected": -1144.0, "loss": 0.1878, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.84375, "rewards/margins": 4.84375, "rewards/rejected": -9.6875, "step": 10080 }, { "epoch": 0.6982215763614975, "grad_norm": 21.313859711057788, "learning_rate": 1.263622830686637e-07, "logits/chosen": -2.96875, "logits/rejected": -3.109375, "logps/chosen": -632.0, "logps/rejected": -1160.0, "loss": 0.1963, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.59375, "rewards/margins": 5.28125, "rewards/rejected": -9.875, "step": 10090 }, { "epoch": 0.6989135699951561, "grad_norm": 24.997823932577212, "learning_rate": 1.258377479130588e-07, "logits/chosen": -2.875, "logits/rejected": -3.15625, "logps/chosen": -640.0, "logps/rejected": -1160.0, "loss": 0.1925, "rewards/accuracies": 0.96875, "rewards/chosen": -4.625, "rewards/margins": 5.25, "rewards/rejected": -9.875, "step": 10100 }, { "epoch": 0.6996055636288147, "grad_norm": 20.470509898046316, "learning_rate": 1.2531393730752337e-07, "logits/chosen": -2.984375, "logits/rejected": -3.375, "logps/chosen": -640.0, "logps/rejected": -1160.0, "loss": 0.2202, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.75, "rewards/margins": 5.21875, "rewards/rejected": -9.9375, "step": 10110 }, { "epoch": 0.7002975572624732, "grad_norm": 28.361456925778235, "learning_rate": 1.2479085430875957e-07, "logits/chosen": -2.90625, "logits/rejected": -3.0625, "logps/chosen": -672.0, "logps/rejected": -1192.0, "loss": 0.1902, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.71875, "rewards/margins": 5.40625, "rewards/rejected": -10.125, "step": 10120 }, { "epoch": 0.7009895508961318, "grad_norm": 19.165382239841886, "learning_rate": 1.2426850196922346e-07, "logits/chosen": -3.078125, "logits/rejected": -3.28125, "logps/chosen": -668.0, "logps/rejected": -1152.0, "loss": 0.2211, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.875, "rewards/margins": 4.9375, "rewards/rejected": -9.8125, "step": 10130 }, { "epoch": 0.7016815445297904, "grad_norm": 16.143912037585505, "learning_rate": 1.237468833371073e-07, "logits/chosen": -2.96875, "logits/rejected": -3.125, "logps/chosen": -676.0, "logps/rejected": -1184.0, "loss": 0.2664, "rewards/accuracies": 0.9375, "rewards/chosen": -4.96875, "rewards/margins": 4.96875, "rewards/rejected": -9.9375, "step": 10140 }, { "epoch": 0.7023735381634489, "grad_norm": 21.92877283233021, "learning_rate": 1.2322600145632203e-07, "logits/chosen": -2.75, "logits/rejected": -3.078125, "logps/chosen": -676.0, "logps/rejected": -1208.0, "loss": 0.1987, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.78125, "rewards/margins": 5.65625, "rewards/rejected": -10.4375, "step": 10150 }, { "epoch": 0.7030655317971075, "grad_norm": 14.303598769513295, "learning_rate": 1.2270585936647876e-07, "logits/chosen": -2.96875, "logits/rejected": -3.234375, "logps/chosen": -668.0, "logps/rejected": -1136.0, "loss": 0.2446, "rewards/accuracies": 0.96875, "rewards/chosen": -4.78125, "rewards/margins": 5.0625, "rewards/rejected": -9.875, "step": 10160 }, { "epoch": 0.7037575254307661, "grad_norm": 19.674474144066316, "learning_rate": 1.2218646010287224e-07, "logits/chosen": -2.9375, "logits/rejected": -3.140625, "logps/chosen": -672.0, "logps/rejected": -1184.0, "loss": 0.2485, "rewards/accuracies": 0.9375, "rewards/chosen": -4.9375, "rewards/margins": 5.0, "rewards/rejected": -9.9375, "step": 10170 }, { "epoch": 0.7044495190644247, "grad_norm": 22.184392493364196, "learning_rate": 1.2166780669646197e-07, "logits/chosen": -2.953125, "logits/rejected": -3.453125, "logps/chosen": -648.0, "logps/rejected": -1136.0, "loss": 0.2587, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.75, "rewards/margins": 5.03125, "rewards/rejected": -9.75, "step": 10180 }, { "epoch": 0.7051415126980832, "grad_norm": 25.119474768515364, "learning_rate": 1.2114990217385506e-07, "logits/chosen": -2.9375, "logits/rejected": -3.296875, "logps/chosen": -640.0, "logps/rejected": -1088.0, "loss": 0.2122, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.6875, "rewards/margins": 4.71875, "rewards/rejected": -9.4375, "step": 10190 }, { "epoch": 0.7058335063317418, "grad_norm": 16.30791124487485, "learning_rate": 1.2063274955728847e-07, "logits/chosen": -2.984375, "logits/rejected": -3.15625, "logps/chosen": -644.0, "logps/rejected": -1144.0, "loss": 0.207, "rewards/accuracies": 0.96875, "rewards/chosen": -4.75, "rewards/margins": 5.0625, "rewards/rejected": -9.8125, "step": 10200 }, { "epoch": 0.7065254999654004, "grad_norm": 22.534645982627293, "learning_rate": 1.201163518646118e-07, "logits/chosen": -2.9375, "logits/rejected": -3.3125, "logps/chosen": -672.0, "logps/rejected": -1144.0, "loss": 0.2208, "rewards/accuracies": 0.9375, "rewards/chosen": -4.84375, "rewards/margins": 5.0, "rewards/rejected": -9.8125, "step": 10210 }, { "epoch": 0.7072174935990589, "grad_norm": 16.518866201709717, "learning_rate": 1.1960071210926868e-07, "logits/chosen": -3.03125, "logits/rejected": -3.46875, "logps/chosen": -632.0, "logps/rejected": -1136.0, "loss": 0.2316, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.5625, "rewards/margins": 5.1875, "rewards/rejected": -9.75, "step": 10220 }, { "epoch": 0.7079094872327174, "grad_norm": 22.19263721110402, "learning_rate": 1.1908583330028018e-07, "logits/chosen": -3.046875, "logits/rejected": -3.09375, "logps/chosen": -640.0, "logps/rejected": -1224.0, "loss": 0.2055, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.6875, "rewards/margins": 5.8125, "rewards/rejected": -10.5, "step": 10230 }, { "epoch": 0.708601480866376, "grad_norm": 16.407300610842984, "learning_rate": 1.1857171844222676e-07, "logits/chosen": -2.9375, "logits/rejected": -3.15625, "logps/chosen": -700.0, "logps/rejected": -1184.0, "loss": 0.2221, "rewards/accuracies": 0.96875, "rewards/chosen": -5.03125, "rewards/margins": 5.09375, "rewards/rejected": -10.125, "step": 10240 }, { "epoch": 0.7092934745000345, "grad_norm": 19.544882560579584, "learning_rate": 1.1805837053523088e-07, "logits/chosen": -2.96875, "logits/rejected": -3.34375, "logps/chosen": -652.0, "logps/rejected": -1120.0, "loss": 0.1762, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.65625, "rewards/margins": 5.0, "rewards/rejected": -9.6875, "step": 10250 }, { "epoch": 0.7099854681336931, "grad_norm": 31.807182789549227, "learning_rate": 1.1754579257493935e-07, "logits/chosen": -2.921875, "logits/rejected": -3.203125, "logps/chosen": -624.0, "logps/rejected": -1168.0, "loss": 0.237, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.5625, "rewards/margins": 5.53125, "rewards/rejected": -10.125, "step": 10260 }, { "epoch": 0.7106774617673517, "grad_norm": 27.180149542242496, "learning_rate": 1.1703398755250605e-07, "logits/chosen": -2.828125, "logits/rejected": -3.25, "logps/chosen": -652.0, "logps/rejected": -1160.0, "loss": 0.2069, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.65625, "rewards/margins": 5.25, "rewards/rejected": -9.9375, "step": 10270 }, { "epoch": 0.7113694554010103, "grad_norm": 26.354798774640464, "learning_rate": 1.165229584545743e-07, "logits/chosen": -2.875, "logits/rejected": -3.296875, "logps/chosen": -652.0, "logps/rejected": -1160.0, "loss": 0.2471, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.65625, "rewards/margins": 5.34375, "rewards/rejected": -10.0, "step": 10280 }, { "epoch": 0.7120614490346688, "grad_norm": 23.462670810798567, "learning_rate": 1.1601270826325954e-07, "logits/chosen": -2.71875, "logits/rejected": -2.96875, "logps/chosen": -672.0, "logps/rejected": -1200.0, "loss": 0.2414, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.875, "rewards/margins": 5.4375, "rewards/rejected": -10.3125, "step": 10290 }, { "epoch": 0.7127534426683274, "grad_norm": 18.04773550936, "learning_rate": 1.1550323995613201e-07, "logits/chosen": -2.90625, "logits/rejected": -3.234375, "logps/chosen": -632.0, "logps/rejected": -1080.0, "loss": 0.224, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.71875, "rewards/margins": 4.625, "rewards/rejected": -9.3125, "step": 10300 }, { "epoch": 0.713445436301986, "grad_norm": 23.68428449055756, "learning_rate": 1.1499455650619882e-07, "logits/chosen": -2.84375, "logits/rejected": -2.96875, "logps/chosen": -664.0, "logps/rejected": -1096.0, "loss": 0.2441, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.90625, "rewards/margins": 4.46875, "rewards/rejected": -9.375, "step": 10310 }, { "epoch": 0.7141374299356446, "grad_norm": 27.924185665421845, "learning_rate": 1.1448666088188763e-07, "logits/chosen": -2.875, "logits/rejected": -3.03125, "logps/chosen": -640.0, "logps/rejected": -1184.0, "loss": 0.2183, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.6875, "rewards/margins": 5.46875, "rewards/rejected": -10.125, "step": 10320 }, { "epoch": 0.7148294235693031, "grad_norm": 61.0175456007377, "learning_rate": 1.1397955604702842e-07, "logits/chosen": -2.8125, "logits/rejected": -3.125, "logps/chosen": -640.0, "logps/rejected": -1128.0, "loss": 0.2753, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.6875, "rewards/margins": 4.9375, "rewards/rejected": -9.625, "step": 10330 }, { "epoch": 0.7155214172029617, "grad_norm": 19.473849884545487, "learning_rate": 1.1347324496083657e-07, "logits/chosen": -2.984375, "logits/rejected": -3.234375, "logps/chosen": -676.0, "logps/rejected": -1144.0, "loss": 0.2174, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.03125, "rewards/margins": 4.8125, "rewards/rejected": -9.875, "step": 10340 }, { "epoch": 0.7162134108366203, "grad_norm": 20.219880296676372, "learning_rate": 1.1296773057789529e-07, "logits/chosen": -2.90625, "logits/rejected": -3.25, "logps/chosen": -644.0, "logps/rejected": -1088.0, "loss": 0.2061, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.8125, "rewards/margins": 4.53125, "rewards/rejected": -9.3125, "step": 10350 }, { "epoch": 0.7169054044702788, "grad_norm": 24.395436008489117, "learning_rate": 1.1246301584813911e-07, "logits/chosen": -2.890625, "logits/rejected": -3.125, "logps/chosen": -684.0, "logps/rejected": -1144.0, "loss": 0.2218, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.0, "rewards/margins": 4.78125, "rewards/rejected": -9.8125, "step": 10360 }, { "epoch": 0.7175973981039374, "grad_norm": 18.34520296148196, "learning_rate": 1.1195910371683586e-07, "logits/chosen": -2.9375, "logits/rejected": -3.125, "logps/chosen": -664.0, "logps/rejected": -1136.0, "loss": 0.191, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.9375, "rewards/margins": 4.9375, "rewards/rejected": -9.875, "step": 10370 }, { "epoch": 0.718289391737596, "grad_norm": 23.958077992285485, "learning_rate": 1.1145599712456969e-07, "logits/chosen": -2.78125, "logits/rejected": -3.046875, "logps/chosen": -668.0, "logps/rejected": -1144.0, "loss": 0.228, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.84375, "rewards/margins": 5.0, "rewards/rejected": -9.8125, "step": 10380 }, { "epoch": 0.7189813853712546, "grad_norm": 11.351911003939783, "learning_rate": 1.1095369900722421e-07, "logits/chosen": -2.96875, "logits/rejected": -3.15625, "logps/chosen": -652.0, "logps/rejected": -1136.0, "loss": 0.2216, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.875, "rewards/margins": 4.8125, "rewards/rejected": -9.6875, "step": 10390 }, { "epoch": 0.7196733790049131, "grad_norm": 19.45688597665701, "learning_rate": 1.1045221229596532e-07, "logits/chosen": -2.90625, "logits/rejected": -3.21875, "logps/chosen": -680.0, "logps/rejected": -1136.0, "loss": 0.252, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.125, "rewards/margins": 4.6875, "rewards/rejected": -9.8125, "step": 10400 }, { "epoch": 0.7203653726385717, "grad_norm": 26.867791838728024, "learning_rate": 1.0995153991722359e-07, "logits/chosen": -3.09375, "logits/rejected": -3.359375, "logps/chosen": -656.0, "logps/rejected": -1168.0, "loss": 0.2273, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.96875, "rewards/margins": 5.1875, "rewards/rejected": -10.1875, "step": 10410 }, { "epoch": 0.7210573662722303, "grad_norm": 29.454759064333473, "learning_rate": 1.0945168479267772e-07, "logits/chosen": -2.875, "logits/rejected": -3.421875, "logps/chosen": -648.0, "logps/rejected": -1192.0, "loss": 0.2382, "rewards/accuracies": 0.96875, "rewards/chosen": -4.65625, "rewards/margins": 5.59375, "rewards/rejected": -10.25, "step": 10420 }, { "epoch": 0.7217493599058888, "grad_norm": 23.122548226698413, "learning_rate": 1.0895264983923738e-07, "logits/chosen": -2.9375, "logits/rejected": -3.125, "logps/chosen": -672.0, "logps/rejected": -1232.0, "loss": 0.2319, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.09375, "rewards/margins": 5.65625, "rewards/rejected": -10.75, "step": 10430 }, { "epoch": 0.7224413535395474, "grad_norm": 20.421420929328725, "learning_rate": 1.08454437969026e-07, "logits/chosen": -2.796875, "logits/rejected": -3.203125, "logps/chosen": -644.0, "logps/rejected": -1128.0, "loss": 0.2357, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.71875, "rewards/margins": 5.03125, "rewards/rejected": -9.75, "step": 10440 }, { "epoch": 0.723133347173206, "grad_norm": 14.773617549055103, "learning_rate": 1.0795705208936398e-07, "logits/chosen": -2.890625, "logits/rejected": -3.234375, "logps/chosen": -644.0, "logps/rejected": -1152.0, "loss": 0.2261, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.6875, "rewards/margins": 5.15625, "rewards/rejected": -9.875, "step": 10450 }, { "epoch": 0.7238253408068646, "grad_norm": 23.685164515761105, "learning_rate": 1.0746049510275163e-07, "logits/chosen": -2.90625, "logits/rejected": -3.0625, "logps/chosen": -688.0, "logps/rejected": -1160.0, "loss": 0.2789, "rewards/accuracies": 0.9375, "rewards/chosen": -5.0625, "rewards/margins": 4.8125, "rewards/rejected": -9.875, "step": 10460 }, { "epoch": 0.7245173344405231, "grad_norm": 18.940116904788937, "learning_rate": 1.0696476990685222e-07, "logits/chosen": -2.890625, "logits/rejected": -3.03125, "logps/chosen": -616.0, "logps/rejected": -1128.0, "loss": 0.2721, "rewards/accuracies": 0.9375, "rewards/chosen": -4.59375, "rewards/margins": 5.125, "rewards/rejected": -9.6875, "step": 10470 }, { "epoch": 0.7252093280741817, "grad_norm": 30.723848730375998, "learning_rate": 1.0646987939447507e-07, "logits/chosen": -2.78125, "logits/rejected": -3.015625, "logps/chosen": -700.0, "logps/rejected": -1160.0, "loss": 0.2347, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.15625, "rewards/margins": 4.78125, "rewards/rejected": -9.9375, "step": 10480 }, { "epoch": 0.7259013217078403, "grad_norm": 20.201651492531965, "learning_rate": 1.059758264535589e-07, "logits/chosen": -2.84375, "logits/rejected": -3.046875, "logps/chosen": -648.0, "logps/rejected": -1152.0, "loss": 0.2456, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.84375, "rewards/margins": 5.09375, "rewards/rejected": -9.9375, "step": 10490 }, { "epoch": 0.7265933153414988, "grad_norm": 27.281212396207344, "learning_rate": 1.0548261396715432e-07, "logits/chosen": -2.921875, "logits/rejected": -3.15625, "logps/chosen": -684.0, "logps/rejected": -1216.0, "loss": 0.2184, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.90625, "rewards/margins": 5.375, "rewards/rejected": -10.3125, "step": 10500 }, { "epoch": 0.7272853089751574, "grad_norm": 18.22556457291366, "learning_rate": 1.0499024481340799e-07, "logits/chosen": -2.9375, "logits/rejected": -3.21875, "logps/chosen": -636.0, "logps/rejected": -1144.0, "loss": 0.2268, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.75, "rewards/margins": 5.25, "rewards/rejected": -10.0, "step": 10510 }, { "epoch": 0.727977302608816, "grad_norm": 39.78547918873813, "learning_rate": 1.0449872186554506e-07, "logits/chosen": -2.9375, "logits/rejected": -3.1875, "logps/chosen": -636.0, "logps/rejected": -1136.0, "loss": 0.2389, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.71875, "rewards/margins": 5.0625, "rewards/rejected": -9.75, "step": 10520 }, { "epoch": 0.7286692962424746, "grad_norm": 29.136288668336608, "learning_rate": 1.040080479918525e-07, "logits/chosen": -2.9375, "logits/rejected": -3.265625, "logps/chosen": -692.0, "logps/rejected": -1112.0, "loss": 0.2573, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.1875, "rewards/margins": 4.4375, "rewards/rejected": -9.625, "step": 10530 }, { "epoch": 0.7293612898761331, "grad_norm": 23.008009718588934, "learning_rate": 1.0351822605566257e-07, "logits/chosen": -2.890625, "logits/rejected": -3.15625, "logps/chosen": -644.0, "logps/rejected": -1128.0, "loss": 0.2599, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.8125, "rewards/margins": 4.90625, "rewards/rejected": -9.6875, "step": 10540 }, { "epoch": 0.7300532835097917, "grad_norm": 30.038019632231347, "learning_rate": 1.030292589153364e-07, "logits/chosen": -2.890625, "logits/rejected": -3.0, "logps/chosen": -668.0, "logps/rejected": -1208.0, "loss": 0.2176, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.8125, "rewards/margins": 5.28125, "rewards/rejected": -10.125, "step": 10550 }, { "epoch": 0.7307452771434503, "grad_norm": 25.642699422143505, "learning_rate": 1.0254114942424635e-07, "logits/chosen": -2.875, "logits/rejected": -3.109375, "logps/chosen": -688.0, "logps/rejected": -1160.0, "loss": 0.2587, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.96875, "rewards/margins": 5.0, "rewards/rejected": -9.9375, "step": 10560 }, { "epoch": 0.7314372707771088, "grad_norm": 22.867273812037876, "learning_rate": 1.020539004307604e-07, "logits/chosen": -2.9375, "logits/rejected": -3.34375, "logps/chosen": -660.0, "logps/rejected": -1112.0, "loss": 0.2065, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.71875, "rewards/margins": 4.8125, "rewards/rejected": -9.5, "step": 10570 }, { "epoch": 0.7321292644107674, "grad_norm": 27.072050658595373, "learning_rate": 1.015675147782249e-07, "logits/chosen": -2.9375, "logits/rejected": -3.359375, "logps/chosen": -648.0, "logps/rejected": -1168.0, "loss": 0.1828, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.78125, "rewards/margins": 5.3125, "rewards/rejected": -10.0625, "step": 10580 }, { "epoch": 0.732821258044426, "grad_norm": 18.994556613133806, "learning_rate": 1.0108199530494823e-07, "logits/chosen": -2.875, "logits/rejected": -3.40625, "logps/chosen": -624.0, "logps/rejected": -1144.0, "loss": 0.2257, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.4375, "rewards/margins": 5.3125, "rewards/rejected": -9.75, "step": 10590 }, { "epoch": 0.7335132516780846, "grad_norm": 15.342962153493218, "learning_rate": 1.0059734484418416e-07, "logits/chosen": -3.046875, "logits/rejected": -3.453125, "logps/chosen": -648.0, "logps/rejected": -1096.0, "loss": 0.2016, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.84375, "rewards/margins": 4.6875, "rewards/rejected": -9.5, "step": 10600 }, { "epoch": 0.7342052453117431, "grad_norm": 33.02810182111989, "learning_rate": 1.0011356622411532e-07, "logits/chosen": -2.859375, "logits/rejected": -3.1875, "logps/chosen": -664.0, "logps/rejected": -1160.0, "loss": 0.2209, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.78125, "rewards/margins": 5.25, "rewards/rejected": -10.0, "step": 10610 }, { "epoch": 0.7348972389454017, "grad_norm": 17.591098070066074, "learning_rate": 9.963066226783673e-08, "logits/chosen": -3.046875, "logits/rejected": -3.296875, "logps/chosen": -640.0, "logps/rejected": -1160.0, "loss": 0.1965, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.75, "rewards/margins": 5.25, "rewards/rejected": -10.0, "step": 10620 }, { "epoch": 0.7355892325790603, "grad_norm": 19.94921742693459, "learning_rate": 9.914863579333921e-08, "logits/chosen": -2.96875, "logits/rejected": -3.34375, "logps/chosen": -660.0, "logps/rejected": -1112.0, "loss": 0.2045, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.8125, "rewards/margins": 4.75, "rewards/rejected": -9.5625, "step": 10630 }, { "epoch": 0.7362812262127189, "grad_norm": 30.756967264620027, "learning_rate": 9.866748961349325e-08, "logits/chosen": -2.96875, "logits/rejected": -3.15625, "logps/chosen": -656.0, "logps/rejected": -1176.0, "loss": 0.2307, "rewards/accuracies": 0.9375, "rewards/chosen": -4.8125, "rewards/margins": 5.34375, "rewards/rejected": -10.125, "step": 10640 }, { "epoch": 0.7369732198463774, "grad_norm": 24.632928677871302, "learning_rate": 9.818722653603193e-08, "logits/chosen": -2.890625, "logits/rejected": -3.1875, "logps/chosen": -668.0, "logps/rejected": -1176.0, "loss": 0.2036, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.9375, "rewards/margins": 5.28125, "rewards/rejected": -10.25, "step": 10650 }, { "epoch": 0.737665213480036, "grad_norm": 15.904343775002207, "learning_rate": 9.770784936353554e-08, "logits/chosen": -2.875, "logits/rejected": -3.3125, "logps/chosen": -660.0, "logps/rejected": -1176.0, "loss": 0.1862, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.65625, "rewards/margins": 5.3125, "rewards/rejected": -10.0, "step": 10660 }, { "epoch": 0.7383572071136946, "grad_norm": 24.079322018769556, "learning_rate": 9.722936089341435e-08, "logits/chosen": -2.875, "logits/rejected": -3.09375, "logps/chosen": -624.0, "logps/rejected": -1144.0, "loss": 0.2165, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.59375, "rewards/margins": 5.25, "rewards/rejected": -9.875, "step": 10670 }, { "epoch": 0.7390492007473531, "grad_norm": 22.20828185978294, "learning_rate": 9.67517639178925e-08, "logits/chosen": -2.828125, "logits/rejected": -3.1875, "logps/chosen": -676.0, "logps/rejected": -1192.0, "loss": 0.2164, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.84375, "rewards/margins": 5.4375, "rewards/rejected": -10.25, "step": 10680 }, { "epoch": 0.7397411943810117, "grad_norm": 28.789537777364153, "learning_rate": 9.627506122399192e-08, "logits/chosen": -2.90625, "logits/rejected": -3.3125, "logps/chosen": -636.0, "logps/rejected": -1120.0, "loss": 0.2319, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.5625, "rewards/margins": 5.1875, "rewards/rejected": -9.75, "step": 10690 }, { "epoch": 0.7404331880146703, "grad_norm": 25.723553643489062, "learning_rate": 9.579925559351623e-08, "logits/chosen": -2.875, "logits/rejected": -2.890625, "logps/chosen": -668.0, "logps/rejected": -1200.0, "loss": 0.201, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.0625, "rewards/margins": 5.21875, "rewards/rejected": -10.25, "step": 10700 }, { "epoch": 0.7411251816483289, "grad_norm": 19.417680324031902, "learning_rate": 9.532434980303372e-08, "logits/chosen": -2.953125, "logits/rejected": -3.203125, "logps/chosen": -644.0, "logps/rejected": -1184.0, "loss": 0.1815, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.625, "rewards/margins": 5.46875, "rewards/rejected": -10.125, "step": 10710 }, { "epoch": 0.7418171752819874, "grad_norm": 22.77840189061087, "learning_rate": 9.485034662386199e-08, "logits/chosen": -2.9375, "logits/rejected": -2.96875, "logps/chosen": -660.0, "logps/rejected": -1200.0, "loss": 0.2135, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.875, "rewards/margins": 5.53125, "rewards/rejected": -10.375, "step": 10720 }, { "epoch": 0.742509168915646, "grad_norm": 19.131555128004997, "learning_rate": 9.437724882205134e-08, "logits/chosen": -2.984375, "logits/rejected": -3.0625, "logps/chosen": -640.0, "logps/rejected": -1184.0, "loss": 0.2107, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.75, "rewards/margins": 5.40625, "rewards/rejected": -10.1875, "step": 10730 }, { "epoch": 0.7432011625493046, "grad_norm": 22.80735992730979, "learning_rate": 9.390505915836871e-08, "logits/chosen": -2.921875, "logits/rejected": -3.234375, "logps/chosen": -652.0, "logps/rejected": -1152.0, "loss": 0.247, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.78125, "rewards/margins": 4.90625, "rewards/rejected": -9.6875, "step": 10740 }, { "epoch": 0.7438931561829631, "grad_norm": 31.892413971452225, "learning_rate": 9.34337803882817e-08, "logits/chosen": -2.921875, "logits/rejected": -3.109375, "logps/chosen": -644.0, "logps/rejected": -1168.0, "loss": 0.2389, "rewards/accuracies": 0.9375, "rewards/chosen": -4.6875, "rewards/margins": 5.46875, "rewards/rejected": -10.125, "step": 10750 }, { "epoch": 0.7445851498166217, "grad_norm": 16.41002609296558, "learning_rate": 9.296341526194226e-08, "logits/chosen": -2.96875, "logits/rejected": -3.109375, "logps/chosen": -660.0, "logps/rejected": -1184.0, "loss": 0.2254, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.90625, "rewards/margins": 5.34375, "rewards/rejected": -10.25, "step": 10760 }, { "epoch": 0.7452771434502803, "grad_norm": 18.731264780669097, "learning_rate": 9.249396652417086e-08, "logits/chosen": -2.765625, "logits/rejected": -3.171875, "logps/chosen": -700.0, "logps/rejected": -1176.0, "loss": 0.2218, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.875, "rewards/margins": 5.125, "rewards/rejected": -10.0, "step": 10770 }, { "epoch": 0.7459691370839389, "grad_norm": 24.420562309210737, "learning_rate": 9.202543691444025e-08, "logits/chosen": -2.90625, "logits/rejected": -3.203125, "logps/chosen": -676.0, "logps/rejected": -1184.0, "loss": 0.205, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.0, "rewards/margins": 5.25, "rewards/rejected": -10.25, "step": 10780 }, { "epoch": 0.7466611307175974, "grad_norm": 23.79840721098353, "learning_rate": 9.15578291668598e-08, "logits/chosen": -2.953125, "logits/rejected": -3.28125, "logps/chosen": -676.0, "logps/rejected": -1152.0, "loss": 0.2344, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.875, "rewards/margins": 5.09375, "rewards/rejected": -9.9375, "step": 10790 }, { "epoch": 0.747353124351256, "grad_norm": 26.06539513328623, "learning_rate": 9.109114601015899e-08, "logits/chosen": -2.90625, "logits/rejected": -3.28125, "logps/chosen": -664.0, "logps/rejected": -1160.0, "loss": 0.2406, "rewards/accuracies": 0.9375, "rewards/chosen": -4.84375, "rewards/margins": 5.0625, "rewards/rejected": -9.875, "step": 10800 }, { "epoch": 0.7480451179849146, "grad_norm": 18.367203239839334, "learning_rate": 9.06253901676723e-08, "logits/chosen": -2.96875, "logits/rejected": -3.359375, "logps/chosen": -640.0, "logps/rejected": -1184.0, "loss": 0.1959, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.78125, "rewards/margins": 5.375, "rewards/rejected": -10.1875, "step": 10810 }, { "epoch": 0.7487371116185731, "grad_norm": 22.929188528865826, "learning_rate": 9.016056435732261e-08, "logits/chosen": -2.84375, "logits/rejected": -3.203125, "logps/chosen": -640.0, "logps/rejected": -1160.0, "loss": 0.2106, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.59375, "rewards/margins": 5.3125, "rewards/rejected": -9.9375, "step": 10820 }, { "epoch": 0.7494291052522317, "grad_norm": 26.53452797661766, "learning_rate": 8.969667129160546e-08, "logits/chosen": -2.828125, "logits/rejected": -3.15625, "logps/chosen": -700.0, "logps/rejected": -1192.0, "loss": 0.2387, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.125, "rewards/margins": 5.09375, "rewards/rejected": -10.1875, "step": 10830 }, { "epoch": 0.7501210988858903, "grad_norm": 22.977965758002366, "learning_rate": 8.923371367757348e-08, "logits/chosen": -2.90625, "logits/rejected": -3.171875, "logps/chosen": -676.0, "logps/rejected": -1160.0, "loss": 0.242, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.09375, "rewards/margins": 4.96875, "rewards/rejected": -10.0625, "step": 10840 }, { "epoch": 0.7508130925195489, "grad_norm": 23.58481628801638, "learning_rate": 8.877169421682076e-08, "logits/chosen": -2.859375, "logits/rejected": -3.0, "logps/chosen": -720.0, "logps/rejected": -1192.0, "loss": 0.2509, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.25, "rewards/margins": 4.90625, "rewards/rejected": -10.1875, "step": 10850 }, { "epoch": 0.7515050861532074, "grad_norm": 13.597547320416131, "learning_rate": 8.831061560546621e-08, "logits/chosen": -2.796875, "logits/rejected": -2.90625, "logps/chosen": -676.0, "logps/rejected": -1192.0, "loss": 0.1988, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.96875, "rewards/margins": 5.15625, "rewards/rejected": -10.125, "step": 10860 }, { "epoch": 0.752197079786866, "grad_norm": 16.67220159625327, "learning_rate": 8.785048053413885e-08, "logits/chosen": -2.984375, "logits/rejected": -3.296875, "logps/chosen": -656.0, "logps/rejected": -1112.0, "loss": 0.2224, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.75, "rewards/margins": 4.8125, "rewards/rejected": -9.5625, "step": 10870 }, { "epoch": 0.7528890734205246, "grad_norm": 23.410784880169604, "learning_rate": 8.739129168796142e-08, "logits/chosen": -2.90625, "logits/rejected": -3.25, "logps/chosen": -680.0, "logps/rejected": -1128.0, "loss": 0.2561, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.09375, "rewards/margins": 4.625, "rewards/rejected": -9.75, "step": 10880 }, { "epoch": 0.7535810670541832, "grad_norm": 30.110191711317984, "learning_rate": 8.693305174653512e-08, "logits/chosen": -2.96875, "logits/rejected": -3.28125, "logps/chosen": -660.0, "logps/rejected": -1168.0, "loss": 0.2354, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.78125, "rewards/margins": 5.40625, "rewards/rejected": -10.1875, "step": 10890 }, { "epoch": 0.7542730606878417, "grad_norm": 27.383593406246362, "learning_rate": 8.64757633839237e-08, "logits/chosen": -2.984375, "logits/rejected": -3.234375, "logps/chosen": -640.0, "logps/rejected": -1136.0, "loss": 0.2421, "rewards/accuracies": 0.9375, "rewards/chosen": -4.71875, "rewards/margins": 5.09375, "rewards/rejected": -9.8125, "step": 10900 }, { "epoch": 0.7549650543215003, "grad_norm": 29.544092071906622, "learning_rate": 8.6019429268638e-08, "logits/chosen": -2.84375, "logits/rejected": -3.234375, "logps/chosen": -688.0, "logps/rejected": -1184.0, "loss": 0.2037, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.84375, "rewards/margins": 5.3125, "rewards/rejected": -10.125, "step": 10910 }, { "epoch": 0.7556570479551589, "grad_norm": 14.99833206497012, "learning_rate": 8.556405206362035e-08, "logits/chosen": -2.859375, "logits/rejected": -3.046875, "logps/chosen": -680.0, "logps/rejected": -1176.0, "loss": 0.2034, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.875, "rewards/margins": 5.0, "rewards/rejected": -9.875, "step": 10920 }, { "epoch": 0.7563490415888173, "grad_norm": 25.30640234918779, "learning_rate": 8.510963442622899e-08, "logits/chosen": -2.734375, "logits/rejected": -3.046875, "logps/chosen": -676.0, "logps/rejected": -1176.0, "loss": 0.2477, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.84375, "rewards/margins": 5.1875, "rewards/rejected": -10.0625, "step": 10930 }, { "epoch": 0.7570410352224759, "grad_norm": 18.041017142990455, "learning_rate": 8.465617900822267e-08, "logits/chosen": -2.9375, "logits/rejected": -3.328125, "logps/chosen": -664.0, "logps/rejected": -1184.0, "loss": 0.2131, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.78125, "rewards/margins": 5.40625, "rewards/rejected": -10.1875, "step": 10940 }, { "epoch": 0.7577330288561345, "grad_norm": 16.673734000258964, "learning_rate": 8.420368845574483e-08, "logits/chosen": -2.875, "logits/rejected": -3.109375, "logps/chosen": -632.0, "logps/rejected": -1176.0, "loss": 0.1939, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.625, "rewards/margins": 5.34375, "rewards/rejected": -10.0, "step": 10950 }, { "epoch": 0.758425022489793, "grad_norm": 25.953865849011876, "learning_rate": 8.375216540930886e-08, "logits/chosen": -2.90625, "logits/rejected": -3.15625, "logps/chosen": -640.0, "logps/rejected": -1160.0, "loss": 0.2029, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.53125, "rewards/margins": 5.5, "rewards/rejected": -10.0, "step": 10960 }, { "epoch": 0.7591170161234516, "grad_norm": 26.720654067595962, "learning_rate": 8.330161250378201e-08, "logits/chosen": -2.890625, "logits/rejected": -3.296875, "logps/chosen": -696.0, "logps/rejected": -1144.0, "loss": 0.2024, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.0625, "rewards/margins": 4.84375, "rewards/rejected": -9.9375, "step": 10970 }, { "epoch": 0.7598090097571102, "grad_norm": 22.43206518137602, "learning_rate": 8.285203236837019e-08, "logits/chosen": -2.875, "logits/rejected": -3.34375, "logps/chosen": -624.0, "logps/rejected": -1160.0, "loss": 0.2271, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.5, "rewards/margins": 5.4375, "rewards/rejected": -10.0, "step": 10980 }, { "epoch": 0.7605010033907688, "grad_norm": 12.338595371482857, "learning_rate": 8.240342762660273e-08, "logits/chosen": -2.859375, "logits/rejected": -3.296875, "logps/chosen": -632.0, "logps/rejected": -1168.0, "loss": 0.2007, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.59375, "rewards/margins": 5.5625, "rewards/rejected": -10.125, "step": 10990 }, { "epoch": 0.7611929970244273, "grad_norm": 44.807568195204425, "learning_rate": 8.195580089631732e-08, "logits/chosen": -2.734375, "logits/rejected": -2.890625, "logps/chosen": -652.0, "logps/rejected": -1208.0, "loss": 0.2192, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.75, "rewards/margins": 5.5, "rewards/rejected": -10.25, "step": 11000 }, { "epoch": 0.7611929970244273, "eval_logits/chosen": -2.90625, "eval_logits/rejected": -3.15625, "eval_logps/chosen": -704.0, "eval_logps/rejected": -1112.0, "eval_loss": 0.2287333905696869, "eval_rewards/accuracies": 0.8979725241661072, "eval_rewards/chosen": -5.15625, "eval_rewards/margins": 4.3125, "eval_rewards/rejected": -9.5, "eval_runtime": 2939.864, "eval_samples_per_second": 33.278, "eval_steps_per_second": 0.52, "step": 11000 }, { "epoch": 0.7618849906580859, "grad_norm": 23.42370340294384, "learning_rate": 8.150915478964421e-08, "logits/chosen": -2.90625, "logits/rejected": -3.328125, "logps/chosen": -648.0, "logps/rejected": -1136.0, "loss": 0.2055, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.75, "rewards/margins": 5.15625, "rewards/rejected": -9.875, "step": 11010 }, { "epoch": 0.7625769842917445, "grad_norm": 30.985069247367857, "learning_rate": 8.106349191299111e-08, "logits/chosen": -2.90625, "logits/rejected": -3.046875, "logps/chosen": -680.0, "logps/rejected": -1192.0, "loss": 0.2329, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.96875, "rewards/margins": 5.15625, "rewards/rejected": -10.125, "step": 11020 }, { "epoch": 0.763268977925403, "grad_norm": 30.20418288401941, "learning_rate": 8.061881486702818e-08, "logits/chosen": -2.875, "logits/rejected": -3.28125, "logps/chosen": -692.0, "logps/rejected": -1152.0, "loss": 0.245, "rewards/accuracies": 0.9375, "rewards/chosen": -4.9375, "rewards/margins": 4.96875, "rewards/rejected": -9.875, "step": 11030 }, { "epoch": 0.7639609715590616, "grad_norm": 11.353721774436876, "learning_rate": 8.017512624667305e-08, "logits/chosen": -2.875, "logits/rejected": -3.328125, "logps/chosen": -644.0, "logps/rejected": -1176.0, "loss": 0.1986, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.84375, "rewards/margins": 5.40625, "rewards/rejected": -10.25, "step": 11040 }, { "epoch": 0.7646529651927202, "grad_norm": 17.88191814653243, "learning_rate": 7.973242864107488e-08, "logits/chosen": -2.859375, "logits/rejected": -2.953125, "logps/chosen": -668.0, "logps/rejected": -1184.0, "loss": 0.2049, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.625, "rewards/margins": 5.34375, "rewards/rejected": -9.9375, "step": 11050 }, { "epoch": 0.7653449588263788, "grad_norm": 27.638632572639917, "learning_rate": 7.929072463360007e-08, "logits/chosen": -2.796875, "logits/rejected": -3.03125, "logps/chosen": -628.0, "logps/rejected": -1144.0, "loss": 0.2514, "rewards/accuracies": 0.9375, "rewards/chosen": -4.625, "rewards/margins": 5.1875, "rewards/rejected": -9.8125, "step": 11060 }, { "epoch": 0.7660369524600373, "grad_norm": 22.620616456928847, "learning_rate": 7.885001680181679e-08, "logits/chosen": -2.921875, "logits/rejected": -3.171875, "logps/chosen": -644.0, "logps/rejected": -1120.0, "loss": 0.2373, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.625, "rewards/margins": 5.03125, "rewards/rejected": -9.625, "step": 11070 }, { "epoch": 0.7667289460936959, "grad_norm": 23.154646171559936, "learning_rate": 7.841030771748005e-08, "logits/chosen": -2.96875, "logits/rejected": -3.328125, "logps/chosen": -660.0, "logps/rejected": -1168.0, "loss": 0.2249, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.75, "rewards/margins": 5.21875, "rewards/rejected": -9.9375, "step": 11080 }, { "epoch": 0.7674209397273545, "grad_norm": 19.57341947103586, "learning_rate": 7.797159994651662e-08, "logits/chosen": -2.96875, "logits/rejected": -3.203125, "logps/chosen": -660.0, "logps/rejected": -1152.0, "loss": 0.2258, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.9375, "rewards/margins": 5.09375, "rewards/rejected": -10.0625, "step": 11090 }, { "epoch": 0.768112933361013, "grad_norm": 11.219680863362912, "learning_rate": 7.753389604901006e-08, "logits/chosen": -2.765625, "logits/rejected": -3.125, "logps/chosen": -668.0, "logps/rejected": -1160.0, "loss": 0.2386, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.96875, "rewards/margins": 5.09375, "rewards/rejected": -10.0625, "step": 11100 }, { "epoch": 0.7688049269946716, "grad_norm": 20.324005481901718, "learning_rate": 7.709719857918589e-08, "logits/chosen": -2.828125, "logits/rejected": -3.015625, "logps/chosen": -668.0, "logps/rejected": -1184.0, "loss": 0.192, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.84375, "rewards/margins": 5.25, "rewards/rejected": -10.0625, "step": 11110 }, { "epoch": 0.7694969206283302, "grad_norm": 18.95994532308068, "learning_rate": 7.666151008539659e-08, "logits/chosen": -2.875, "logits/rejected": -3.015625, "logps/chosen": -668.0, "logps/rejected": -1168.0, "loss": 0.2148, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.0, "rewards/margins": 4.875, "rewards/rejected": -9.875, "step": 11120 }, { "epoch": 0.7701889142619888, "grad_norm": 16.136996722873437, "learning_rate": 7.622683311010683e-08, "logits/chosen": -2.921875, "logits/rejected": -3.140625, "logps/chosen": -660.0, "logps/rejected": -1176.0, "loss": 0.2105, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.84375, "rewards/margins": 5.1875, "rewards/rejected": -10.0, "step": 11130 }, { "epoch": 0.7708809078956473, "grad_norm": 26.232193939401935, "learning_rate": 7.579317018987821e-08, "logits/chosen": -2.8125, "logits/rejected": -3.0, "logps/chosen": -684.0, "logps/rejected": -1136.0, "loss": 0.2055, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.90625, "rewards/margins": 4.75, "rewards/rejected": -9.625, "step": 11140 }, { "epoch": 0.7715729015293059, "grad_norm": 17.991012677748778, "learning_rate": 7.536052385535532e-08, "logits/chosen": -2.96875, "logits/rejected": -3.109375, "logps/chosen": -648.0, "logps/rejected": -1176.0, "loss": 0.2372, "rewards/accuracies": 0.96875, "rewards/chosen": -4.75, "rewards/margins": 5.21875, "rewards/rejected": -10.0, "step": 11150 }, { "epoch": 0.7722648951629645, "grad_norm": 34.09372892303167, "learning_rate": 7.492889663125015e-08, "logits/chosen": -2.90625, "logits/rejected": -3.203125, "logps/chosen": -640.0, "logps/rejected": -1160.0, "loss": 0.2132, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.71875, "rewards/margins": 5.3125, "rewards/rejected": -10.0, "step": 11160 }, { "epoch": 0.772956888796623, "grad_norm": 19.84581993291925, "learning_rate": 7.44982910363276e-08, "logits/chosen": -2.953125, "logits/rejected": -3.171875, "logps/chosen": -664.0, "logps/rejected": -1136.0, "loss": 0.2297, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.03125, "rewards/margins": 4.625, "rewards/rejected": -9.625, "step": 11170 }, { "epoch": 0.7736488824302816, "grad_norm": 16.333468797050696, "learning_rate": 7.40687095833909e-08, "logits/chosen": -2.828125, "logits/rejected": -3.03125, "logps/chosen": -700.0, "logps/rejected": -1200.0, "loss": 0.2328, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.15625, "rewards/margins": 5.0625, "rewards/rejected": -10.25, "step": 11180 }, { "epoch": 0.7743408760639402, "grad_norm": 32.06876153451914, "learning_rate": 7.36401547792672e-08, "logits/chosen": -2.984375, "logits/rejected": -3.15625, "logps/chosen": -660.0, "logps/rejected": -1184.0, "loss": 0.201, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.875, "rewards/margins": 5.28125, "rewards/rejected": -10.125, "step": 11190 }, { "epoch": 0.7750328696975988, "grad_norm": 24.94386859595336, "learning_rate": 7.321262912479209e-08, "logits/chosen": -2.890625, "logits/rejected": -3.265625, "logps/chosen": -672.0, "logps/rejected": -1176.0, "loss": 0.2045, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.03125, "rewards/margins": 5.15625, "rewards/rejected": -10.1875, "step": 11200 }, { "epoch": 0.7757248633312573, "grad_norm": 25.12577704903191, "learning_rate": 7.278613511479593e-08, "logits/chosen": -2.90625, "logits/rejected": -3.234375, "logps/chosen": -656.0, "logps/rejected": -1136.0, "loss": 0.2579, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.8125, "rewards/margins": 5.03125, "rewards/rejected": -9.875, "step": 11210 }, { "epoch": 0.7764168569649159, "grad_norm": 20.465619142956093, "learning_rate": 7.23606752380888e-08, "logits/chosen": -2.921875, "logits/rejected": -3.203125, "logps/chosen": -632.0, "logps/rejected": -1152.0, "loss": 0.2201, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.5625, "rewards/margins": 5.4375, "rewards/rejected": -10.0, "step": 11220 }, { "epoch": 0.7771088505985745, "grad_norm": 33.128853196082666, "learning_rate": 7.19362519774461e-08, "logits/chosen": -2.875, "logits/rejected": -3.0625, "logps/chosen": -636.0, "logps/rejected": -1136.0, "loss": 0.2435, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.65625, "rewards/margins": 5.0625, "rewards/rejected": -9.6875, "step": 11230 }, { "epoch": 0.7778008442322331, "grad_norm": 18.392518986440717, "learning_rate": 7.151286780959398e-08, "logits/chosen": -2.90625, "logits/rejected": -3.359375, "logps/chosen": -668.0, "logps/rejected": -1112.0, "loss": 0.2557, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.96875, "rewards/margins": 4.6875, "rewards/rejected": -9.625, "step": 11240 }, { "epoch": 0.7784928378658916, "grad_norm": 28.155511446531442, "learning_rate": 7.109052520519506e-08, "logits/chosen": -2.8125, "logits/rejected": -3.28125, "logps/chosen": -656.0, "logps/rejected": -1168.0, "loss": 0.1897, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.625, "rewards/margins": 5.59375, "rewards/rejected": -10.25, "step": 11250 }, { "epoch": 0.7791848314995502, "grad_norm": 23.65706671442202, "learning_rate": 7.066922662883384e-08, "logits/chosen": -2.84375, "logits/rejected": -3.046875, "logps/chosen": -660.0, "logps/rejected": -1232.0, "loss": 0.2605, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.71875, "rewards/margins": 5.78125, "rewards/rejected": -10.5, "step": 11260 }, { "epoch": 0.7798768251332088, "grad_norm": 18.38228462800009, "learning_rate": 7.024897453900238e-08, "logits/chosen": -2.84375, "logits/rejected": -3.09375, "logps/chosen": -660.0, "logps/rejected": -1136.0, "loss": 0.2409, "rewards/accuracies": 0.9375, "rewards/chosen": -4.84375, "rewards/margins": 4.84375, "rewards/rejected": -9.6875, "step": 11270 }, { "epoch": 0.7805688187668673, "grad_norm": 23.45860932694325, "learning_rate": 6.982977138808604e-08, "logits/chosen": -2.875, "logits/rejected": -3.140625, "logps/chosen": -668.0, "logps/rejected": -1168.0, "loss": 0.2354, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.0625, "rewards/margins": 5.03125, "rewards/rejected": -10.0625, "step": 11280 }, { "epoch": 0.7812608124005259, "grad_norm": 30.115036414122795, "learning_rate": 6.941161962234881e-08, "logits/chosen": -2.671875, "logits/rejected": -2.828125, "logps/chosen": -668.0, "logps/rejected": -1168.0, "loss": 0.1966, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.90625, "rewards/margins": 5.34375, "rewards/rejected": -10.25, "step": 11290 }, { "epoch": 0.7819528060341845, "grad_norm": 25.1251840924652, "learning_rate": 6.899452168191974e-08, "logits/chosen": -2.96875, "logits/rejected": -3.1875, "logps/chosen": -648.0, "logps/rejected": -1160.0, "loss": 0.2291, "rewards/accuracies": 0.96875, "rewards/chosen": -4.8125, "rewards/margins": 5.0625, "rewards/rejected": -9.875, "step": 11300 }, { "epoch": 0.7826447996678431, "grad_norm": 18.892387838917205, "learning_rate": 6.857848000077807e-08, "logits/chosen": -2.859375, "logits/rejected": -3.1875, "logps/chosen": -648.0, "logps/rejected": -1192.0, "loss": 0.1948, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.625, "rewards/margins": 5.6875, "rewards/rejected": -10.3125, "step": 11310 }, { "epoch": 0.7833367933015016, "grad_norm": 22.650593990033144, "learning_rate": 6.816349700673904e-08, "logits/chosen": -2.78125, "logits/rejected": -3.09375, "logps/chosen": -680.0, "logps/rejected": -1128.0, "loss": 0.2503, "rewards/accuracies": 0.90625, "rewards/chosen": -5.0, "rewards/margins": 4.65625, "rewards/rejected": -9.625, "step": 11320 }, { "epoch": 0.7840287869351602, "grad_norm": 27.22037965568523, "learning_rate": 6.774957512144008e-08, "logits/chosen": -2.96875, "logits/rejected": -3.25, "logps/chosen": -644.0, "logps/rejected": -1168.0, "loss": 0.2372, "rewards/accuracies": 0.9375, "rewards/chosen": -4.5625, "rewards/margins": 5.375, "rewards/rejected": -9.9375, "step": 11330 }, { "epoch": 0.7847207805688188, "grad_norm": 31.963712606899005, "learning_rate": 6.733671676032673e-08, "logits/chosen": -2.90625, "logits/rejected": -3.109375, "logps/chosen": -648.0, "logps/rejected": -1128.0, "loss": 0.2689, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.78125, "rewards/margins": 5.0, "rewards/rejected": -9.75, "step": 11340 }, { "epoch": 0.7854127742024773, "grad_norm": 18.426062536184183, "learning_rate": 6.692492433263791e-08, "logits/chosen": -2.796875, "logits/rejected": -2.859375, "logps/chosen": -672.0, "logps/rejected": -1160.0, "loss": 0.2223, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.875, "rewards/margins": 5.0625, "rewards/rejected": -9.9375, "step": 11350 }, { "epoch": 0.7861047678361359, "grad_norm": 17.695131404996687, "learning_rate": 6.651420024139251e-08, "logits/chosen": -2.828125, "logits/rejected": -3.015625, "logps/chosen": -660.0, "logps/rejected": -1144.0, "loss": 0.2154, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.6875, "rewards/margins": 5.1875, "rewards/rejected": -9.875, "step": 11360 }, { "epoch": 0.7867967614697945, "grad_norm": 17.80848264015352, "learning_rate": 6.610454688337514e-08, "logits/chosen": -2.984375, "logits/rejected": -3.265625, "logps/chosen": -632.0, "logps/rejected": -1144.0, "loss": 0.2354, "rewards/accuracies": 0.9375, "rewards/chosen": -4.65625, "rewards/margins": 5.09375, "rewards/rejected": -9.75, "step": 11370 }, { "epoch": 0.7874887551034531, "grad_norm": 28.83158907722568, "learning_rate": 6.5695966649122e-08, "logits/chosen": -2.859375, "logits/rejected": -3.0, "logps/chosen": -620.0, "logps/rejected": -1176.0, "loss": 0.1929, "rewards/accuracies": 0.96875, "rewards/chosen": -4.375, "rewards/margins": 5.5625, "rewards/rejected": -9.9375, "step": 11380 }, { "epoch": 0.7881807487371116, "grad_norm": 15.715351001837972, "learning_rate": 6.528846192290723e-08, "logits/chosen": -2.953125, "logits/rejected": -3.15625, "logps/chosen": -648.0, "logps/rejected": -1152.0, "loss": 0.2327, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.71875, "rewards/margins": 5.1875, "rewards/rejected": -9.875, "step": 11390 }, { "epoch": 0.7888727423707702, "grad_norm": 26.057835178178173, "learning_rate": 6.488203508272871e-08, "logits/chosen": -2.96875, "logits/rejected": -3.375, "logps/chosen": -648.0, "logps/rejected": -1096.0, "loss": 0.2397, "rewards/accuracies": 0.96875, "rewards/chosen": -4.8125, "rewards/margins": 4.59375, "rewards/rejected": -9.4375, "step": 11400 }, { "epoch": 0.7895647360044288, "grad_norm": 22.663134342992528, "learning_rate": 6.44766885002944e-08, "logits/chosen": -2.890625, "logits/rejected": -3.25, "logps/chosen": -644.0, "logps/rejected": -1136.0, "loss": 0.2024, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.625, "rewards/margins": 5.21875, "rewards/rejected": -9.875, "step": 11410 }, { "epoch": 0.7902567296380874, "grad_norm": 27.447499012812575, "learning_rate": 6.40724245410083e-08, "logits/chosen": -2.8125, "logits/rejected": -2.890625, "logps/chosen": -668.0, "logps/rejected": -1144.0, "loss": 0.252, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.8125, "rewards/margins": 4.96875, "rewards/rejected": -9.75, "step": 11420 }, { "epoch": 0.7909487232717459, "grad_norm": 34.587775357349, "learning_rate": 6.366924556395692e-08, "logits/chosen": -2.828125, "logits/rejected": -3.078125, "logps/chosen": -648.0, "logps/rejected": -1144.0, "loss": 0.2174, "rewards/accuracies": 0.96875, "rewards/chosen": -4.71875, "rewards/margins": 5.15625, "rewards/rejected": -9.875, "step": 11430 }, { "epoch": 0.7916407169054045, "grad_norm": 17.32473587055759, "learning_rate": 6.326715392189497e-08, "logits/chosen": -2.875, "logits/rejected": -2.96875, "logps/chosen": -664.0, "logps/rejected": -1192.0, "loss": 0.2148, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.8125, "rewards/margins": 5.4375, "rewards/rejected": -10.25, "step": 11440 }, { "epoch": 0.7923327105390631, "grad_norm": 12.679402681839683, "learning_rate": 6.286615196123252e-08, "logits/chosen": -2.84375, "logits/rejected": -3.046875, "logps/chosen": -692.0, "logps/rejected": -1152.0, "loss": 0.2019, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.0625, "rewards/margins": 4.84375, "rewards/rejected": -9.875, "step": 11450 }, { "epoch": 0.7930247041727216, "grad_norm": 30.779094547472138, "learning_rate": 6.246624202202053e-08, "logits/chosen": -2.890625, "logits/rejected": -3.046875, "logps/chosen": -632.0, "logps/rejected": -1144.0, "loss": 0.2089, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.5625, "rewards/margins": 5.1875, "rewards/rejected": -9.75, "step": 11460 }, { "epoch": 0.7937166978063802, "grad_norm": 24.86619349786065, "learning_rate": 6.206742643793731e-08, "logits/chosen": -2.859375, "logits/rejected": -2.984375, "logps/chosen": -632.0, "logps/rejected": -1160.0, "loss": 0.2439, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.5625, "rewards/margins": 5.28125, "rewards/rejected": -9.875, "step": 11470 }, { "epoch": 0.7944086914400388, "grad_norm": 27.333732149510304, "learning_rate": 6.166970753627524e-08, "logits/chosen": -2.890625, "logits/rejected": -3.140625, "logps/chosen": -644.0, "logps/rejected": -1152.0, "loss": 0.2231, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.75, "rewards/margins": 5.21875, "rewards/rejected": -9.9375, "step": 11480 }, { "epoch": 0.7951006850736974, "grad_norm": 40.599357908227255, "learning_rate": 6.127308763792716e-08, "logits/chosen": -2.75, "logits/rejected": -3.03125, "logps/chosen": -648.0, "logps/rejected": -1168.0, "loss": 0.2143, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.53125, "rewards/margins": 5.59375, "rewards/rejected": -10.125, "step": 11490 }, { "epoch": 0.7957926787073559, "grad_norm": 13.407925207579368, "learning_rate": 6.087756905737224e-08, "logits/chosen": -2.90625, "logits/rejected": -3.0, "logps/chosen": -676.0, "logps/rejected": -1128.0, "loss": 0.2197, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.875, "rewards/margins": 4.75, "rewards/rejected": -9.6875, "step": 11500 }, { "epoch": 0.7964846723410145, "grad_norm": 21.499467139264958, "learning_rate": 6.048315410266324e-08, "logits/chosen": -2.859375, "logits/rejected": -3.015625, "logps/chosen": -640.0, "logps/rejected": -1152.0, "loss": 0.2205, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.65625, "rewards/margins": 5.125, "rewards/rejected": -9.8125, "step": 11510 }, { "epoch": 0.7971766659746731, "grad_norm": 22.865533042248412, "learning_rate": 6.008984507541257e-08, "logits/chosen": -3.03125, "logits/rejected": -3.03125, "logps/chosen": -648.0, "logps/rejected": -1144.0, "loss": 0.2425, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.78125, "rewards/margins": 4.875, "rewards/rejected": -9.625, "step": 11520 }, { "epoch": 0.7978686596083316, "grad_norm": 21.395276514020914, "learning_rate": 5.9697644270779e-08, "logits/chosen": -2.84375, "logits/rejected": -2.78125, "logps/chosen": -636.0, "logps/rejected": -1144.0, "loss": 0.2434, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.78125, "rewards/margins": 5.03125, "rewards/rejected": -9.8125, "step": 11530 }, { "epoch": 0.7985606532419902, "grad_norm": 31.314183315459804, "learning_rate": 5.930655397745429e-08, "logits/chosen": -2.890625, "logits/rejected": -3.21875, "logps/chosen": -668.0, "logps/rejected": -1112.0, "loss": 0.2707, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.9375, "rewards/margins": 4.53125, "rewards/rejected": -9.4375, "step": 11540 }, { "epoch": 0.7992526468756488, "grad_norm": 26.263159637317916, "learning_rate": 5.891657647764975e-08, "logits/chosen": -2.875, "logits/rejected": -2.96875, "logps/chosen": -656.0, "logps/rejected": -1168.0, "loss": 0.2365, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.6875, "rewards/margins": 5.28125, "rewards/rejected": -9.9375, "step": 11550 }, { "epoch": 0.7999446405093074, "grad_norm": 16.495413753793443, "learning_rate": 5.8527714047083036e-08, "logits/chosen": -2.921875, "logits/rejected": -3.3125, "logps/chosen": -620.0, "logps/rejected": -1136.0, "loss": 0.2511, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.5625, "rewards/margins": 5.15625, "rewards/rejected": -9.6875, "step": 11560 }, { "epoch": 0.8006366341429659, "grad_norm": 22.00040738405666, "learning_rate": 5.813996895496473e-08, "logits/chosen": -2.984375, "logits/rejected": -3.21875, "logps/chosen": -636.0, "logps/rejected": -1112.0, "loss": 0.234, "rewards/accuracies": 0.96875, "rewards/chosen": -4.6875, "rewards/margins": 4.84375, "rewards/rejected": -9.5625, "step": 11570 }, { "epoch": 0.8013286277766245, "grad_norm": 24.592336260681684, "learning_rate": 5.775334346398531e-08, "logits/chosen": -2.984375, "logits/rejected": -3.03125, "logps/chosen": -672.0, "logps/rejected": -1160.0, "loss": 0.1844, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.0625, "rewards/margins": 4.8125, "rewards/rejected": -9.875, "step": 11580 }, { "epoch": 0.8020206214102831, "grad_norm": 29.61335302010054, "learning_rate": 5.73678398303015e-08, "logits/chosen": -2.84375, "logits/rejected": -3.125, "logps/chosen": -652.0, "logps/rejected": -1200.0, "loss": 0.2068, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.625, "rewards/margins": 5.65625, "rewards/rejected": -10.3125, "step": 11590 }, { "epoch": 0.8027126150439416, "grad_norm": 25.737903968981342, "learning_rate": 5.6983460303523804e-08, "logits/chosen": -2.78125, "logits/rejected": -2.96875, "logps/chosen": -660.0, "logps/rejected": -1200.0, "loss": 0.1998, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.625, "rewards/margins": 5.375, "rewards/rejected": -10.0, "step": 11600 }, { "epoch": 0.8034046086776002, "grad_norm": 12.258804874146017, "learning_rate": 5.6600207126702855e-08, "logits/chosen": -2.875, "logits/rejected": -3.078125, "logps/chosen": -652.0, "logps/rejected": -1160.0, "loss": 0.2268, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.5625, "rewards/margins": 5.21875, "rewards/rejected": -9.75, "step": 11610 }, { "epoch": 0.8040966023112588, "grad_norm": 16.022008680528746, "learning_rate": 5.621808253631624e-08, "logits/chosen": -2.90625, "logits/rejected": -3.109375, "logps/chosen": -640.0, "logps/rejected": -1168.0, "loss": 0.2105, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.625, "rewards/margins": 5.28125, "rewards/rejected": -9.9375, "step": 11620 }, { "epoch": 0.8047885959449174, "grad_norm": 18.657763678758123, "learning_rate": 5.5837088762255875e-08, "logits/chosen": -2.9375, "logits/rejected": -3.4375, "logps/chosen": -640.0, "logps/rejected": -1152.0, "loss": 0.2332, "rewards/accuracies": 0.96875, "rewards/chosen": -4.625, "rewards/margins": 5.46875, "rewards/rejected": -10.125, "step": 11630 }, { "epoch": 0.8054805895785758, "grad_norm": 22.508185471612748, "learning_rate": 5.5457228027814965e-08, "logits/chosen": -2.875, "logits/rejected": -3.3125, "logps/chosen": -620.0, "logps/rejected": -1112.0, "loss": 0.229, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.5, "rewards/margins": 5.15625, "rewards/rejected": -9.625, "step": 11640 }, { "epoch": 0.8061725832122344, "grad_norm": 25.342098189137186, "learning_rate": 5.5078502549674414e-08, "logits/chosen": -2.875, "logits/rejected": -3.109375, "logps/chosen": -664.0, "logps/rejected": -1176.0, "loss": 0.2268, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.75, "rewards/margins": 5.34375, "rewards/rejected": -10.125, "step": 11650 }, { "epoch": 0.806864576845893, "grad_norm": 14.79327031997703, "learning_rate": 5.470091453789064e-08, "logits/chosen": -2.75, "logits/rejected": -3.109375, "logps/chosen": -660.0, "logps/rejected": -1168.0, "loss": 0.2494, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.90625, "rewards/margins": 5.15625, "rewards/rejected": -10.0625, "step": 11660 }, { "epoch": 0.8075565704795515, "grad_norm": 22.19495397161271, "learning_rate": 5.4324466195882186e-08, "logits/chosen": -2.78125, "logits/rejected": -3.203125, "logps/chosen": -656.0, "logps/rejected": -1160.0, "loss": 0.2375, "rewards/accuracies": 0.96875, "rewards/chosen": -4.71875, "rewards/margins": 5.25, "rewards/rejected": -9.9375, "step": 11670 }, { "epoch": 0.8082485641132101, "grad_norm": 21.79307524738152, "learning_rate": 5.394915972041739e-08, "logits/chosen": -2.90625, "logits/rejected": -2.953125, "logps/chosen": -668.0, "logps/rejected": -1176.0, "loss": 0.2484, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.8125, "rewards/margins": 5.03125, "rewards/rejected": -9.875, "step": 11680 }, { "epoch": 0.8089405577468687, "grad_norm": 20.20085888900626, "learning_rate": 5.35749973016007e-08, "logits/chosen": -3.0, "logits/rejected": -3.140625, "logps/chosen": -644.0, "logps/rejected": -1136.0, "loss": 0.2184, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.8125, "rewards/margins": 4.96875, "rewards/rejected": -9.75, "step": 11690 }, { "epoch": 0.8096325513805273, "grad_norm": 21.336344435756537, "learning_rate": 5.320198112286076e-08, "logits/chosen": -2.96875, "logits/rejected": -3.34375, "logps/chosen": -648.0, "logps/rejected": -1104.0, "loss": 0.2288, "rewards/accuracies": 0.9375, "rewards/chosen": -4.9375, "rewards/margins": 4.71875, "rewards/rejected": -9.625, "step": 11700 }, { "epoch": 0.8103245450141858, "grad_norm": 21.805661319470676, "learning_rate": 5.283011336093726e-08, "logits/chosen": -2.78125, "logits/rejected": -3.15625, "logps/chosen": -636.0, "logps/rejected": -1160.0, "loss": 0.1953, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.59375, "rewards/margins": 5.34375, "rewards/rejected": -9.9375, "step": 11710 }, { "epoch": 0.8110165386478444, "grad_norm": 26.509433426968045, "learning_rate": 5.24593961858682e-08, "logits/chosen": -2.90625, "logits/rejected": -2.9375, "logps/chosen": -692.0, "logps/rejected": -1176.0, "loss": 0.2292, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.84375, "rewards/margins": 5.125, "rewards/rejected": -10.0, "step": 11720 }, { "epoch": 0.811708532281503, "grad_norm": 20.463588938348963, "learning_rate": 5.208983176097742e-08, "logits/chosen": -2.9375, "logits/rejected": -3.21875, "logps/chosen": -652.0, "logps/rejected": -1176.0, "loss": 0.2089, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.84375, "rewards/margins": 5.34375, "rewards/rejected": -10.1875, "step": 11730 }, { "epoch": 0.8124005259151615, "grad_norm": 18.495153051464545, "learning_rate": 5.172142224286183e-08, "logits/chosen": -2.84375, "logits/rejected": -3.125, "logps/chosen": -680.0, "logps/rejected": -1200.0, "loss": 0.2361, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.96875, "rewards/margins": 5.375, "rewards/rejected": -10.375, "step": 11740 }, { "epoch": 0.8130925195488201, "grad_norm": 23.604340412122784, "learning_rate": 5.135416978137888e-08, "logits/chosen": -2.84375, "logits/rejected": -3.25, "logps/chosen": -624.0, "logps/rejected": -1136.0, "loss": 0.2407, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.625, "rewards/margins": 5.15625, "rewards/rejected": -9.75, "step": 11750 }, { "epoch": 0.8137845131824787, "grad_norm": 19.938607955212646, "learning_rate": 5.0988076519633965e-08, "logits/chosen": -2.828125, "logits/rejected": -3.046875, "logps/chosen": -612.0, "logps/rejected": -1120.0, "loss": 0.1955, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.53125, "rewards/margins": 4.9375, "rewards/rejected": -9.5, "step": 11760 }, { "epoch": 0.8144765068161373, "grad_norm": 19.836034167626956, "learning_rate": 5.0623144593968106e-08, "logits/chosen": -2.875, "logits/rejected": -3.09375, "logps/chosen": -664.0, "logps/rejected": -1168.0, "loss": 0.2243, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.84375, "rewards/margins": 5.125, "rewards/rejected": -10.0, "step": 11770 }, { "epoch": 0.8151685004497958, "grad_norm": 22.321235934189602, "learning_rate": 5.025937613394499e-08, "logits/chosen": -2.84375, "logits/rejected": -3.3125, "logps/chosen": -636.0, "logps/rejected": -1136.0, "loss": 0.2195, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.5625, "rewards/margins": 5.34375, "rewards/rejected": -9.875, "step": 11780 }, { "epoch": 0.8158604940834544, "grad_norm": 36.161334858242554, "learning_rate": 4.989677326233935e-08, "logits/chosen": -2.9375, "logits/rejected": -3.328125, "logps/chosen": -660.0, "logps/rejected": -1152.0, "loss": 0.2074, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.8125, "rewards/margins": 5.125, "rewards/rejected": -9.9375, "step": 11790 }, { "epoch": 0.816552487717113, "grad_norm": 37.33185943829899, "learning_rate": 4.953533809512383e-08, "logits/chosen": -2.890625, "logits/rejected": -3.109375, "logps/chosen": -684.0, "logps/rejected": -1144.0, "loss": 0.2308, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.875, "rewards/margins": 4.90625, "rewards/rejected": -9.8125, "step": 11800 }, { "epoch": 0.8172444813507715, "grad_norm": 21.886405715607946, "learning_rate": 4.9175072741456906e-08, "logits/chosen": -2.78125, "logits/rejected": -3.078125, "logps/chosen": -640.0, "logps/rejected": -1152.0, "loss": 0.2264, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.4375, "rewards/margins": 5.46875, "rewards/rejected": -9.9375, "step": 11810 }, { "epoch": 0.8179364749844301, "grad_norm": 28.04955797955008, "learning_rate": 4.881597930367065e-08, "logits/chosen": -2.78125, "logits/rejected": -3.1875, "logps/chosen": -636.0, "logps/rejected": -1176.0, "loss": 0.1981, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.6875, "rewards/margins": 5.5, "rewards/rejected": -10.1875, "step": 11820 }, { "epoch": 0.8186284686180887, "grad_norm": 22.196646727304664, "learning_rate": 4.8458059877258595e-08, "logits/chosen": -2.9375, "logits/rejected": -3.109375, "logps/chosen": -696.0, "logps/rejected": -1184.0, "loss": 0.2208, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.09375, "rewards/margins": 5.15625, "rewards/rejected": -10.25, "step": 11830 }, { "epoch": 0.8193204622517473, "grad_norm": 25.676225857648472, "learning_rate": 4.8101316550863e-08, "logits/chosen": -2.828125, "logits/rejected": -3.140625, "logps/chosen": -660.0, "logps/rejected": -1200.0, "loss": 0.2032, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.78125, "rewards/margins": 5.5, "rewards/rejected": -10.25, "step": 11840 }, { "epoch": 0.8200124558854058, "grad_norm": 20.573493355704326, "learning_rate": 4.774575140626316e-08, "logits/chosen": -2.734375, "logits/rejected": -3.046875, "logps/chosen": -700.0, "logps/rejected": -1216.0, "loss": 0.2385, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.96875, "rewards/margins": 5.375, "rewards/rejected": -10.375, "step": 11850 }, { "epoch": 0.8207044495190644, "grad_norm": 24.819495644877374, "learning_rate": 4.739136651836306e-08, "logits/chosen": -2.953125, "logits/rejected": -3.3125, "logps/chosen": -640.0, "logps/rejected": -1168.0, "loss": 0.2214, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.78125, "rewards/margins": 5.34375, "rewards/rejected": -10.125, "step": 11860 }, { "epoch": 0.821396443152723, "grad_norm": 23.91218397349646, "learning_rate": 4.703816395517929e-08, "logits/chosen": -2.78125, "logits/rejected": -3.203125, "logps/chosen": -688.0, "logps/rejected": -1208.0, "loss": 0.2416, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.03125, "rewards/margins": 5.3125, "rewards/rejected": -10.375, "step": 11870 }, { "epoch": 0.8220884367863815, "grad_norm": 19.97690411263416, "learning_rate": 4.668614577782892e-08, "logits/chosen": -2.765625, "logits/rejected": -3.09375, "logps/chosen": -656.0, "logps/rejected": -1152.0, "loss": 0.2096, "rewards/accuracies": 0.96875, "rewards/chosen": -4.75, "rewards/margins": 5.15625, "rewards/rejected": -9.9375, "step": 11880 }, { "epoch": 0.8227804304200401, "grad_norm": 31.298988068347416, "learning_rate": 4.63353140405176e-08, "logits/chosen": -2.859375, "logits/rejected": -3.15625, "logps/chosen": -656.0, "logps/rejected": -1160.0, "loss": 0.2334, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.96875, "rewards/margins": 5.03125, "rewards/rejected": -10.0, "step": 11890 }, { "epoch": 0.8234724240536987, "grad_norm": 19.304580903442044, "learning_rate": 4.5985670790527425e-08, "logits/chosen": -3.0, "logits/rejected": -3.21875, "logps/chosen": -680.0, "logps/rejected": -1152.0, "loss": 0.217, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.9375, "rewards/margins": 4.9375, "rewards/rejected": -9.875, "step": 11900 }, { "epoch": 0.8241644176873573, "grad_norm": 21.780862497210407, "learning_rate": 4.5637218068205115e-08, "logits/chosen": -2.984375, "logits/rejected": -3.125, "logps/chosen": -672.0, "logps/rejected": -1136.0, "loss": 0.2312, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.9375, "rewards/margins": 4.71875, "rewards/rejected": -9.6875, "step": 11910 }, { "epoch": 0.8248564113210158, "grad_norm": 18.978504930317655, "learning_rate": 4.528995790695012e-08, "logits/chosen": -2.78125, "logits/rejected": -3.0625, "logps/chosen": -684.0, "logps/rejected": -1152.0, "loss": 0.1799, "rewards/accuracies": 0.96875, "rewards/chosen": -4.96875, "rewards/margins": 4.9375, "rewards/rejected": -9.9375, "step": 11920 }, { "epoch": 0.8255484049546744, "grad_norm": 20.439849614831363, "learning_rate": 4.494389233320234e-08, "logits/chosen": -3.0, "logits/rejected": -3.46875, "logps/chosen": -640.0, "logps/rejected": -1120.0, "loss": 0.2436, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.78125, "rewards/margins": 4.9375, "rewards/rejected": -9.6875, "step": 11930 }, { "epoch": 0.826240398588333, "grad_norm": 20.036923921675527, "learning_rate": 4.459902336643115e-08, "logits/chosen": -2.84375, "logits/rejected": -3.03125, "logps/chosen": -648.0, "logps/rejected": -1160.0, "loss": 0.2193, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.6875, "rewards/margins": 5.125, "rewards/rejected": -9.8125, "step": 11940 }, { "epoch": 0.8269323922219916, "grad_norm": 26.58365187274381, "learning_rate": 4.425535301912284e-08, "logits/chosen": -2.96875, "logits/rejected": -3.015625, "logps/chosen": -660.0, "logps/rejected": -1160.0, "loss": 0.2317, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.90625, "rewards/margins": 5.03125, "rewards/rejected": -9.9375, "step": 11950 }, { "epoch": 0.8276243858556501, "grad_norm": 19.089280970994867, "learning_rate": 4.3912883296769094e-08, "logits/chosen": -2.828125, "logits/rejected": -3.03125, "logps/chosen": -668.0, "logps/rejected": -1160.0, "loss": 0.2571, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.78125, "rewards/margins": 5.21875, "rewards/rejected": -10.0, "step": 11960 }, { "epoch": 0.8283163794893087, "grad_norm": 18.071267795665683, "learning_rate": 4.3571616197855374e-08, "logits/chosen": -2.8125, "logits/rejected": -3.0625, "logps/chosen": -648.0, "logps/rejected": -1136.0, "loss": 0.1989, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.65625, "rewards/margins": 5.09375, "rewards/rejected": -9.75, "step": 11970 }, { "epoch": 0.8290083731229673, "grad_norm": 13.629425060297093, "learning_rate": 4.3231553713849456e-08, "logits/chosen": -2.796875, "logits/rejected": -3.21875, "logps/chosen": -652.0, "logps/rejected": -1144.0, "loss": 0.1862, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.59375, "rewards/margins": 5.25, "rewards/rejected": -9.875, "step": 11980 }, { "epoch": 0.8297003667566258, "grad_norm": 29.260409769534025, "learning_rate": 4.2892697829189224e-08, "logits/chosen": -2.921875, "logits/rejected": -3.15625, "logps/chosen": -668.0, "logps/rejected": -1184.0, "loss": 0.1862, "rewards/accuracies": 0.96875, "rewards/chosen": -4.96875, "rewards/margins": 5.28125, "rewards/rejected": -10.25, "step": 11990 }, { "epoch": 0.8303923603902844, "grad_norm": 27.792175001826315, "learning_rate": 4.255505052127165e-08, "logits/chosen": -2.90625, "logits/rejected": -3.109375, "logps/chosen": -688.0, "logps/rejected": -1144.0, "loss": 0.2324, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.09375, "rewards/margins": 4.78125, "rewards/rejected": -9.875, "step": 12000 }, { "epoch": 0.8303923603902844, "eval_logits/chosen": -2.90625, "eval_logits/rejected": -3.171875, "eval_logps/chosen": -700.0, "eval_logps/rejected": -1112.0, "eval_loss": 0.22875253856182098, "eval_rewards/accuracies": 0.8971549868583679, "eval_rewards/chosen": -5.125, "eval_rewards/margins": 4.375, "eval_rewards/rejected": -9.5, "eval_runtime": 2940.5113, "eval_samples_per_second": 33.271, "eval_steps_per_second": 0.52, "step": 12000 }, { "epoch": 0.831084354023943, "grad_norm": 22.891603696140532, "learning_rate": 4.221861376044095e-08, "logits/chosen": -2.9375, "logits/rejected": -3.171875, "logps/chosen": -644.0, "logps/rejected": -1160.0, "loss": 0.199, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.5625, "rewards/margins": 5.34375, "rewards/rejected": -9.9375, "step": 12010 }, { "epoch": 0.8317763476576016, "grad_norm": 24.68734163580455, "learning_rate": 4.1883389509977276e-08, "logits/chosen": -2.9375, "logits/rejected": -3.0625, "logps/chosen": -660.0, "logps/rejected": -1216.0, "loss": 0.2201, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.84375, "rewards/margins": 5.53125, "rewards/rejected": -10.375, "step": 12020 }, { "epoch": 0.8324683412912601, "grad_norm": 26.79499616209037, "learning_rate": 4.15493797260851e-08, "logits/chosen": -2.921875, "logits/rejected": -3.140625, "logps/chosen": -652.0, "logps/rejected": -1136.0, "loss": 0.241, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.8125, "rewards/margins": 4.90625, "rewards/rejected": -9.75, "step": 12030 }, { "epoch": 0.8331603349249187, "grad_norm": 26.037062348500903, "learning_rate": 4.1216586357881834e-08, "logits/chosen": -2.859375, "logits/rejected": -3.03125, "logps/chosen": -656.0, "logps/rejected": -1192.0, "loss": 0.2167, "rewards/accuracies": 0.96875, "rewards/chosen": -4.71875, "rewards/margins": 5.46875, "rewards/rejected": -10.1875, "step": 12040 }, { "epoch": 0.8338523285585773, "grad_norm": 21.607225306316103, "learning_rate": 4.088501134738656e-08, "logits/chosen": -2.875, "logits/rejected": -3.4375, "logps/chosen": -632.0, "logps/rejected": -1184.0, "loss": 0.1784, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.59375, "rewards/margins": 5.6875, "rewards/rejected": -10.3125, "step": 12050 }, { "epoch": 0.8345443221922358, "grad_norm": 26.450643736403865, "learning_rate": 4.055465662950858e-08, "logits/chosen": -2.953125, "logits/rejected": -3.3125, "logps/chosen": -656.0, "logps/rejected": -1144.0, "loss": 0.2268, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.78125, "rewards/margins": 5.09375, "rewards/rejected": -9.875, "step": 12060 }, { "epoch": 0.8352363158258944, "grad_norm": 24.201781628758845, "learning_rate": 4.022552413203622e-08, "logits/chosen": -2.921875, "logits/rejected": -3.25, "logps/chosen": -644.0, "logps/rejected": -1184.0, "loss": 0.1937, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.8125, "rewards/margins": 5.46875, "rewards/rejected": -10.25, "step": 12070 }, { "epoch": 0.835928309459553, "grad_norm": 19.501514103222473, "learning_rate": 3.989761577562531e-08, "logits/chosen": -3.015625, "logits/rejected": -3.046875, "logps/chosen": -660.0, "logps/rejected": -1184.0, "loss": 0.2155, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.84375, "rewards/margins": 5.28125, "rewards/rejected": -10.125, "step": 12080 }, { "epoch": 0.8366203030932116, "grad_norm": 24.60915802870861, "learning_rate": 3.957093347378851e-08, "logits/chosen": -2.875, "logits/rejected": -3.140625, "logps/chosen": -652.0, "logps/rejected": -1168.0, "loss": 0.2192, "rewards/accuracies": 0.96875, "rewards/chosen": -4.65625, "rewards/margins": 5.40625, "rewards/rejected": -10.0625, "step": 12090 }, { "epoch": 0.8373122967268701, "grad_norm": 24.629520165548332, "learning_rate": 3.9245479132883676e-08, "logits/chosen": -3.0, "logits/rejected": -3.125, "logps/chosen": -648.0, "logps/rejected": -1120.0, "loss": 0.226, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.65625, "rewards/margins": 4.8125, "rewards/rejected": -9.5, "step": 12100 }, { "epoch": 0.8380042903605287, "grad_norm": 18.843328719137617, "learning_rate": 3.89212546521028e-08, "logits/chosen": -2.984375, "logits/rejected": -3.234375, "logps/chosen": -692.0, "logps/rejected": -1176.0, "loss": 0.2143, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.09375, "rewards/margins": 4.875, "rewards/rejected": -9.9375, "step": 12110 }, { "epoch": 0.8386962839941873, "grad_norm": 16.80062340931521, "learning_rate": 3.859826192346108e-08, "logits/chosen": -3.0, "logits/rejected": -3.375, "logps/chosen": -656.0, "logps/rejected": -1120.0, "loss": 0.2185, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.71875, "rewards/margins": 4.875, "rewards/rejected": -9.625, "step": 12120 }, { "epoch": 0.8393882776278458, "grad_norm": 19.497537566354442, "learning_rate": 3.827650283178599e-08, "logits/chosen": -3.015625, "logits/rejected": -3.265625, "logps/chosen": -652.0, "logps/rejected": -1152.0, "loss": 0.2189, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.84375, "rewards/margins": 5.0, "rewards/rejected": -9.8125, "step": 12130 }, { "epoch": 0.8400802712615044, "grad_norm": 26.257036001697536, "learning_rate": 3.7955979254705754e-08, "logits/chosen": -2.859375, "logits/rejected": -3.0, "logps/chosen": -648.0, "logps/rejected": -1216.0, "loss": 0.1915, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.6875, "rewards/margins": 5.71875, "rewards/rejected": -10.4375, "step": 12140 }, { "epoch": 0.840772264895163, "grad_norm": 28.46049682162109, "learning_rate": 3.763669306263906e-08, "logits/chosen": -3.0, "logits/rejected": -3.40625, "logps/chosen": -660.0, "logps/rejected": -1200.0, "loss": 0.228, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.84375, "rewards/margins": 5.5625, "rewards/rejected": -10.375, "step": 12150 }, { "epoch": 0.8414642585288216, "grad_norm": 25.406593017858075, "learning_rate": 3.7318646118783604e-08, "logits/chosen": -2.921875, "logits/rejected": -3.171875, "logps/chosen": -680.0, "logps/rejected": -1184.0, "loss": 0.2211, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.125, "rewards/margins": 5.0, "rewards/rejected": -10.125, "step": 12160 }, { "epoch": 0.8421562521624801, "grad_norm": 28.0957442749456, "learning_rate": 3.700184027910555e-08, "logits/chosen": -2.921875, "logits/rejected": -3.265625, "logps/chosen": -632.0, "logps/rejected": -1144.0, "loss": 0.2166, "rewards/accuracies": 0.96875, "rewards/chosen": -4.65625, "rewards/margins": 5.28125, "rewards/rejected": -9.9375, "step": 12170 }, { "epoch": 0.8428482457961387, "grad_norm": 32.89516839349263, "learning_rate": 3.66862773923286e-08, "logits/chosen": -2.921875, "logits/rejected": -3.390625, "logps/chosen": -636.0, "logps/rejected": -1128.0, "loss": 0.1811, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.65625, "rewards/margins": 5.09375, "rewards/rejected": -9.75, "step": 12180 }, { "epoch": 0.8435402394297973, "grad_norm": 16.467045530205354, "learning_rate": 3.63719592999231e-08, "logits/chosen": -2.9375, "logits/rejected": -3.171875, "logps/chosen": -624.0, "logps/rejected": -1160.0, "loss": 0.2348, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.53125, "rewards/margins": 5.4375, "rewards/rejected": -10.0, "step": 12190 }, { "epoch": 0.8442322330634559, "grad_norm": 27.98474143340558, "learning_rate": 3.6058887836095455e-08, "logits/chosen": -2.953125, "logits/rejected": -3.265625, "logps/chosen": -664.0, "logps/rejected": -1136.0, "loss": 0.2295, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.8125, "rewards/margins": 4.9375, "rewards/rejected": -9.75, "step": 12200 }, { "epoch": 0.8449242266971144, "grad_norm": 21.570327863075683, "learning_rate": 3.574706482777731e-08, "logits/chosen": -3.0, "logits/rejected": -3.203125, "logps/chosen": -648.0, "logps/rejected": -1136.0, "loss": 0.2198, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.9375, "rewards/margins": 4.96875, "rewards/rejected": -9.9375, "step": 12210 }, { "epoch": 0.845616220330773, "grad_norm": 23.250592218755333, "learning_rate": 3.543649209461508e-08, "logits/chosen": -2.953125, "logits/rejected": -3.21875, "logps/chosen": -668.0, "logps/rejected": -1128.0, "loss": 0.2231, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.84375, "rewards/margins": 4.71875, "rewards/rejected": -9.5625, "step": 12220 }, { "epoch": 0.8463082139644316, "grad_norm": 22.861760436973693, "learning_rate": 3.512717144895883e-08, "logits/chosen": -2.921875, "logits/rejected": -3.046875, "logps/chosen": -664.0, "logps/rejected": -1168.0, "loss": 0.2321, "rewards/accuracies": 0.90625, "rewards/chosen": -4.96875, "rewards/margins": 5.03125, "rewards/rejected": -10.0, "step": 12230 }, { "epoch": 0.8470002075980901, "grad_norm": 24.91187929607771, "learning_rate": 3.481910469585253e-08, "logits/chosen": -2.875, "logits/rejected": -3.4375, "logps/chosen": -628.0, "logps/rejected": -1136.0, "loss": 0.2417, "rewards/accuracies": 0.96875, "rewards/chosen": -4.65625, "rewards/margins": 5.25, "rewards/rejected": -9.9375, "step": 12240 }, { "epoch": 0.8476922012317487, "grad_norm": 26.4994978508312, "learning_rate": 3.451229363302277e-08, "logits/chosen": -2.921875, "logits/rejected": -3.328125, "logps/chosen": -652.0, "logps/rejected": -1128.0, "loss": 0.2421, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.65625, "rewards/margins": 5.0625, "rewards/rejected": -9.75, "step": 12250 }, { "epoch": 0.8483841948654073, "grad_norm": 20.097521289433484, "learning_rate": 3.420674005086849e-08, "logits/chosen": -3.0, "logits/rejected": -3.171875, "logps/chosen": -648.0, "logps/rejected": -1160.0, "loss": 0.2237, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.71875, "rewards/margins": 5.40625, "rewards/rejected": -10.125, "step": 12260 }, { "epoch": 0.8490761884990659, "grad_norm": 25.651376142797282, "learning_rate": 3.3902445732450685e-08, "logits/chosen": -2.9375, "logits/rejected": -3.265625, "logps/chosen": -640.0, "logps/rejected": -1128.0, "loss": 0.2203, "rewards/accuracies": 0.96875, "rewards/chosen": -4.8125, "rewards/margins": 5.03125, "rewards/rejected": -9.8125, "step": 12270 }, { "epoch": 0.8497681821327244, "grad_norm": 12.795303661025814, "learning_rate": 3.359941245348208e-08, "logits/chosen": -2.953125, "logits/rejected": -3.359375, "logps/chosen": -668.0, "logps/rejected": -1160.0, "loss": 0.2362, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.0, "rewards/margins": 4.9375, "rewards/rejected": -9.9375, "step": 12280 }, { "epoch": 0.850460175766383, "grad_norm": 12.9496763818367, "learning_rate": 3.3297641982316245e-08, "logits/chosen": -2.984375, "logits/rejected": -3.484375, "logps/chosen": -668.0, "logps/rejected": -1152.0, "loss": 0.2305, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.96875, "rewards/margins": 5.15625, "rewards/rejected": -10.125, "step": 12290 }, { "epoch": 0.8511521694000416, "grad_norm": 21.87164203963982, "learning_rate": 3.299713607993787e-08, "logits/chosen": -2.859375, "logits/rejected": -3.25, "logps/chosen": -672.0, "logps/rejected": -1112.0, "loss": 0.2004, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.84375, "rewards/margins": 4.6875, "rewards/rejected": -9.5, "step": 12300 }, { "epoch": 0.8518441630337001, "grad_norm": 28.626027207071953, "learning_rate": 3.269789649995214e-08, "logits/chosen": -2.96875, "logits/rejected": -3.28125, "logps/chosen": -680.0, "logps/rejected": -1176.0, "loss": 0.1969, "rewards/accuracies": 0.9375, "rewards/chosen": -4.9375, "rewards/margins": 4.9375, "rewards/rejected": -9.875, "step": 12310 }, { "epoch": 0.8525361566673587, "grad_norm": 19.52742408245844, "learning_rate": 3.239992498857466e-08, "logits/chosen": -2.90625, "logits/rejected": -3.125, "logps/chosen": -632.0, "logps/rejected": -1144.0, "loss": 0.2509, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.65625, "rewards/margins": 5.3125, "rewards/rejected": -9.9375, "step": 12320 }, { "epoch": 0.8532281503010173, "grad_norm": 12.920144829726059, "learning_rate": 3.2103223284621144e-08, "logits/chosen": -2.84375, "logits/rejected": -2.953125, "logps/chosen": -648.0, "logps/rejected": -1192.0, "loss": 0.2134, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.75, "rewards/margins": 5.53125, "rewards/rejected": -10.25, "step": 12330 }, { "epoch": 0.8539201439346757, "grad_norm": 17.319622281398356, "learning_rate": 3.180779311949741e-08, "logits/chosen": -2.875, "logits/rejected": -3.109375, "logps/chosen": -636.0, "logps/rejected": -1096.0, "loss": 0.1813, "rewards/accuracies": 0.96875, "rewards/chosen": -4.6875, "rewards/margins": 4.78125, "rewards/rejected": -9.4375, "step": 12340 }, { "epoch": 0.8546121375683343, "grad_norm": 28.41566634839036, "learning_rate": 3.151363621718911e-08, "logits/chosen": -2.890625, "logits/rejected": -3.234375, "logps/chosen": -656.0, "logps/rejected": -1120.0, "loss": 0.2545, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.75, "rewards/margins": 4.9375, "rewards/rejected": -9.6875, "step": 12350 }, { "epoch": 0.8553041312019929, "grad_norm": 31.21819686976598, "learning_rate": 3.1220754294251834e-08, "logits/chosen": -2.953125, "logits/rejected": -3.28125, "logps/chosen": -624.0, "logps/rejected": -1136.0, "loss": 0.2415, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.5625, "rewards/margins": 5.28125, "rewards/rejected": -9.875, "step": 12360 }, { "epoch": 0.8559961248356515, "grad_norm": 29.787868789205135, "learning_rate": 3.0929149059800986e-08, "logits/chosen": -2.921875, "logits/rejected": -3.296875, "logps/chosen": -644.0, "logps/rejected": -1168.0, "loss": 0.25, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.75, "rewards/margins": 5.3125, "rewards/rejected": -10.0625, "step": 12370 }, { "epoch": 0.85668811846931, "grad_norm": 28.291888797393703, "learning_rate": 3.0638822215501795e-08, "logits/chosen": -2.953125, "logits/rejected": -3.328125, "logps/chosen": -672.0, "logps/rejected": -1160.0, "loss": 0.2066, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.0, "rewards/margins": 5.09375, "rewards/rejected": -10.125, "step": 12380 }, { "epoch": 0.8573801121029686, "grad_norm": 26.86303268677775, "learning_rate": 3.034977545555956e-08, "logits/chosen": -2.828125, "logits/rejected": -3.09375, "logps/chosen": -656.0, "logps/rejected": -1144.0, "loss": 0.2456, "rewards/accuracies": 0.9375, "rewards/chosen": -4.75, "rewards/margins": 5.09375, "rewards/rejected": -9.875, "step": 12390 }, { "epoch": 0.8580721057366272, "grad_norm": 27.97358223423886, "learning_rate": 3.0062010466709474e-08, "logits/chosen": -2.96875, "logits/rejected": -3.390625, "logps/chosen": -648.0, "logps/rejected": -1176.0, "loss": 0.2465, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.78125, "rewards/margins": 5.3125, "rewards/rejected": -10.125, "step": 12400 }, { "epoch": 0.8587640993702857, "grad_norm": 22.966093885152542, "learning_rate": 2.9775528928207115e-08, "logits/chosen": -2.890625, "logits/rejected": -3.328125, "logps/chosen": -660.0, "logps/rejected": -1136.0, "loss": 0.234, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.84375, "rewards/margins": 4.9375, "rewards/rejected": -9.8125, "step": 12410 }, { "epoch": 0.8594560930039443, "grad_norm": 20.90080893670045, "learning_rate": 2.949033251181826e-08, "logits/chosen": -2.96875, "logits/rejected": -3.15625, "logps/chosen": -656.0, "logps/rejected": -1168.0, "loss": 0.2294, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.90625, "rewards/margins": 5.21875, "rewards/rejected": -10.125, "step": 12420 }, { "epoch": 0.8601480866376029, "grad_norm": 27.05745011460886, "learning_rate": 2.9206422881809466e-08, "logits/chosen": -2.9375, "logits/rejected": -3.15625, "logps/chosen": -664.0, "logps/rejected": -1144.0, "loss": 0.2151, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.78125, "rewards/margins": 4.96875, "rewards/rejected": -9.75, "step": 12430 }, { "epoch": 0.8608400802712615, "grad_norm": 16.5151166449589, "learning_rate": 2.892380169493841e-08, "logits/chosen": -2.921875, "logits/rejected": -3.4375, "logps/chosen": -656.0, "logps/rejected": -1160.0, "loss": 0.1863, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.90625, "rewards/margins": 5.21875, "rewards/rejected": -10.125, "step": 12440 }, { "epoch": 0.86153207390492, "grad_norm": 29.09485267106005, "learning_rate": 2.864247060044367e-08, "logits/chosen": -2.9375, "logits/rejected": -3.140625, "logps/chosen": -672.0, "logps/rejected": -1152.0, "loss": 0.2007, "rewards/accuracies": 0.9375, "rewards/chosen": -5.0, "rewards/margins": 4.90625, "rewards/rejected": -9.9375, "step": 12450 }, { "epoch": 0.8622240675385786, "grad_norm": 12.152290664634554, "learning_rate": 2.836243124003576e-08, "logits/chosen": -2.890625, "logits/rejected": -3.1875, "logps/chosen": -664.0, "logps/rejected": -1152.0, "loss": 0.2161, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.8125, "rewards/margins": 4.96875, "rewards/rejected": -9.8125, "step": 12460 }, { "epoch": 0.8629160611722372, "grad_norm": 12.822389167419349, "learning_rate": 2.808368524788715e-08, "logits/chosen": -2.953125, "logits/rejected": -2.765625, "logps/chosen": -660.0, "logps/rejected": -1184.0, "loss": 0.176, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.875, "rewards/margins": 5.09375, "rewards/rejected": -9.9375, "step": 12470 }, { "epoch": 0.8636080548058958, "grad_norm": 24.945342270626483, "learning_rate": 2.7806234250622894e-08, "logits/chosen": -3.09375, "logits/rejected": -3.203125, "logps/chosen": -688.0, "logps/rejected": -1144.0, "loss": 0.204, "rewards/accuracies": 0.90625, "rewards/chosen": -5.125, "rewards/margins": 4.71875, "rewards/rejected": -9.875, "step": 12480 }, { "epoch": 0.8643000484395543, "grad_norm": 19.530571190689546, "learning_rate": 2.7530079867311096e-08, "logits/chosen": -2.875, "logits/rejected": -3.0625, "logps/chosen": -640.0, "logps/rejected": -1144.0, "loss": 0.2134, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.84375, "rewards/margins": 5.1875, "rewards/rejected": -10.0, "step": 12490 }, { "epoch": 0.8649920420732129, "grad_norm": 28.952959474410275, "learning_rate": 2.7255223709453406e-08, "logits/chosen": -2.6875, "logits/rejected": -3.046875, "logps/chosen": -692.0, "logps/rejected": -1184.0, "loss": 0.2048, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.84375, "rewards/margins": 5.3125, "rewards/rejected": -10.1875, "step": 12500 }, { "epoch": 0.8656840357068715, "grad_norm": 12.370057243316928, "learning_rate": 2.698166738097571e-08, "logits/chosen": -2.984375, "logits/rejected": -3.21875, "logps/chosen": -644.0, "logps/rejected": -1184.0, "loss": 0.2179, "rewards/accuracies": 0.96875, "rewards/chosen": -4.84375, "rewards/margins": 5.34375, "rewards/rejected": -10.1875, "step": 12510 }, { "epoch": 0.86637602934053, "grad_norm": 22.934360666673395, "learning_rate": 2.6709412478218702e-08, "logits/chosen": -2.921875, "logits/rejected": -3.390625, "logps/chosen": -612.0, "logps/rejected": -1160.0, "loss": 0.1921, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.53125, "rewards/margins": 5.40625, "rewards/rejected": -9.9375, "step": 12520 }, { "epoch": 0.8670680229741886, "grad_norm": 30.4778385516906, "learning_rate": 2.643846058992866e-08, "logits/chosen": -2.921875, "logits/rejected": -3.375, "logps/chosen": -656.0, "logps/rejected": -1120.0, "loss": 0.2251, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.84375, "rewards/margins": 4.8125, "rewards/rejected": -9.6875, "step": 12530 }, { "epoch": 0.8677600166078472, "grad_norm": 18.878333472536482, "learning_rate": 2.6168813297247877e-08, "logits/chosen": -2.84375, "logits/rejected": -3.015625, "logps/chosen": -660.0, "logps/rejected": -1168.0, "loss": 0.2286, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.875, "rewards/margins": 5.1875, "rewards/rejected": -10.0625, "step": 12540 }, { "epoch": 0.8684520102415058, "grad_norm": 24.65794454313523, "learning_rate": 2.5900472173706028e-08, "logits/chosen": -2.90625, "logits/rejected": -3.15625, "logps/chosen": -680.0, "logps/rejected": -1160.0, "loss": 0.2189, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.9375, "rewards/margins": 4.96875, "rewards/rejected": -9.9375, "step": 12550 }, { "epoch": 0.8691440038751643, "grad_norm": 26.74325906329408, "learning_rate": 2.5633438785210404e-08, "logits/chosen": -2.890625, "logits/rejected": -3.203125, "logps/chosen": -620.0, "logps/rejected": -1152.0, "loss": 0.2234, "rewards/accuracies": 0.96875, "rewards/chosen": -4.5625, "rewards/margins": 5.40625, "rewards/rejected": -10.0, "step": 12560 }, { "epoch": 0.8698359975088229, "grad_norm": 22.015057866847698, "learning_rate": 2.536771469003693e-08, "logits/chosen": -2.859375, "logits/rejected": -3.0625, "logps/chosen": -652.0, "logps/rejected": -1224.0, "loss": 0.2071, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.84375, "rewards/margins": 5.46875, "rewards/rejected": -10.3125, "step": 12570 }, { "epoch": 0.8705279911424815, "grad_norm": 22.22230249372207, "learning_rate": 2.510330143882125e-08, "logits/chosen": -2.796875, "logits/rejected": -3.234375, "logps/chosen": -648.0, "logps/rejected": -1160.0, "loss": 0.209, "rewards/accuracies": 0.9375, "rewards/chosen": -4.65625, "rewards/margins": 5.34375, "rewards/rejected": -10.0, "step": 12580 }, { "epoch": 0.87121998477614, "grad_norm": 18.727754785245423, "learning_rate": 2.484020057454969e-08, "logits/chosen": -2.953125, "logits/rejected": -3.359375, "logps/chosen": -640.0, "logps/rejected": -1176.0, "loss": 0.1908, "rewards/accuracies": 0.96875, "rewards/chosen": -4.6875, "rewards/margins": 5.46875, "rewards/rejected": -10.1875, "step": 12590 }, { "epoch": 0.8719119784097986, "grad_norm": 10.070015050590932, "learning_rate": 2.4578413632549838e-08, "logits/chosen": -2.828125, "logits/rejected": -2.984375, "logps/chosen": -664.0, "logps/rejected": -1184.0, "loss": 0.1829, "rewards/accuracies": 0.96875, "rewards/chosen": -4.65625, "rewards/margins": 5.4375, "rewards/rejected": -10.125, "step": 12600 }, { "epoch": 0.8726039720434572, "grad_norm": 16.62302357334876, "learning_rate": 2.431794214048205e-08, "logits/chosen": -2.828125, "logits/rejected": -3.0, "logps/chosen": -648.0, "logps/rejected": -1160.0, "loss": 0.2104, "rewards/accuracies": 0.96875, "rewards/chosen": -4.65625, "rewards/margins": 5.34375, "rewards/rejected": -10.0, "step": 12610 }, { "epoch": 0.8732959656771158, "grad_norm": 27.89107956647082, "learning_rate": 2.4058787618330384e-08, "logits/chosen": -2.953125, "logits/rejected": -3.171875, "logps/chosen": -660.0, "logps/rejected": -1144.0, "loss": 0.2096, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.6875, "rewards/margins": 5.09375, "rewards/rejected": -9.8125, "step": 12620 }, { "epoch": 0.8739879593107743, "grad_norm": 18.004194040848805, "learning_rate": 2.3800951578393596e-08, "logits/chosen": -2.96875, "logits/rejected": -3.171875, "logps/chosen": -628.0, "logps/rejected": -1144.0, "loss": 0.2297, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.65625, "rewards/margins": 5.15625, "rewards/rejected": -9.8125, "step": 12630 }, { "epoch": 0.8746799529444329, "grad_norm": 23.759017225859854, "learning_rate": 2.3544435525276546e-08, "logits/chosen": -2.84375, "logits/rejected": -3.21875, "logps/chosen": -652.0, "logps/rejected": -1136.0, "loss": 0.1822, "rewards/accuracies": 0.96875, "rewards/chosen": -4.6875, "rewards/margins": 5.03125, "rewards/rejected": -9.75, "step": 12640 }, { "epoch": 0.8753719465780915, "grad_norm": 15.310029937118756, "learning_rate": 2.3289240955881222e-08, "logits/chosen": -2.859375, "logits/rejected": -3.1875, "logps/chosen": -668.0, "logps/rejected": -1256.0, "loss": 0.1911, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.8125, "rewards/margins": 5.9375, "rewards/rejected": -10.75, "step": 12650 }, { "epoch": 0.87606394021175, "grad_norm": 18.02687533065379, "learning_rate": 2.3035369359398067e-08, "logits/chosen": -2.796875, "logits/rejected": -3.28125, "logps/chosen": -628.0, "logps/rejected": -1128.0, "loss": 0.1887, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.53125, "rewards/margins": 5.1875, "rewards/rejected": -9.75, "step": 12660 }, { "epoch": 0.8767559338454086, "grad_norm": 21.41925179921425, "learning_rate": 2.278282221729738e-08, "logits/chosen": -2.90625, "logits/rejected": -3.171875, "logps/chosen": -644.0, "logps/rejected": -1200.0, "loss": 0.2069, "rewards/accuracies": 0.96875, "rewards/chosen": -4.5, "rewards/margins": 5.8125, "rewards/rejected": -10.3125, "step": 12670 }, { "epoch": 0.8774479274790672, "grad_norm": 27.98104900812072, "learning_rate": 2.253160100332055e-08, "logits/chosen": -2.859375, "logits/rejected": -2.96875, "logps/chosen": -652.0, "logps/rejected": -1240.0, "loss": 0.1997, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.71875, "rewards/margins": 5.875, "rewards/rejected": -10.5625, "step": 12680 }, { "epoch": 0.8781399211127258, "grad_norm": 24.373132913126902, "learning_rate": 2.2281707183471386e-08, "logits/chosen": -2.953125, "logits/rejected": -3.34375, "logps/chosen": -652.0, "logps/rejected": -1144.0, "loss": 0.2318, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.78125, "rewards/margins": 5.125, "rewards/rejected": -9.875, "step": 12690 }, { "epoch": 0.8788319147463843, "grad_norm": 31.375461711951022, "learning_rate": 2.2033142216007912e-08, "logits/chosen": -2.765625, "logits/rejected": -3.203125, "logps/chosen": -664.0, "logps/rejected": -1216.0, "loss": 0.1945, "rewards/accuracies": 0.96875, "rewards/chosen": -4.78125, "rewards/margins": 5.625, "rewards/rejected": -10.4375, "step": 12700 }, { "epoch": 0.8795239083800429, "grad_norm": 25.557503641720697, "learning_rate": 2.1785907551433474e-08, "logits/chosen": -2.90625, "logits/rejected": -3.28125, "logps/chosen": -680.0, "logps/rejected": -1176.0, "loss": 0.2199, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.875, "rewards/margins": 5.28125, "rewards/rejected": -10.125, "step": 12710 }, { "epoch": 0.8802159020137015, "grad_norm": 26.325894201882818, "learning_rate": 2.1540004632488396e-08, "logits/chosen": -2.84375, "logits/rejected": -3.25, "logps/chosen": -656.0, "logps/rejected": -1168.0, "loss": 0.1967, "rewards/accuracies": 0.96875, "rewards/chosen": -4.78125, "rewards/margins": 5.3125, "rewards/rejected": -10.0625, "step": 12720 }, { "epoch": 0.88090789564736, "grad_norm": 30.275418063162316, "learning_rate": 2.1295434894141594e-08, "logits/chosen": -2.828125, "logits/rejected": -3.265625, "logps/chosen": -644.0, "logps/rejected": -1160.0, "loss": 0.2019, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.65625, "rewards/margins": 5.375, "rewards/rejected": -10.0, "step": 12730 }, { "epoch": 0.8815998892810186, "grad_norm": 26.323092924536027, "learning_rate": 2.1052199763582357e-08, "logits/chosen": -2.796875, "logits/rejected": -3.265625, "logps/chosen": -680.0, "logps/rejected": -1168.0, "loss": 0.2029, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.8125, "rewards/margins": 5.125, "rewards/rejected": -9.9375, "step": 12740 }, { "epoch": 0.8822918829146772, "grad_norm": 22.87761674176511, "learning_rate": 2.0810300660211667e-08, "logits/chosen": -2.84375, "logits/rejected": -3.140625, "logps/chosen": -664.0, "logps/rejected": -1184.0, "loss": 0.2057, "rewards/accuracies": 0.96875, "rewards/chosen": -4.75, "rewards/margins": 5.375, "rewards/rejected": -10.125, "step": 12750 }, { "epoch": 0.8829838765483358, "grad_norm": 23.76212114379537, "learning_rate": 2.0569738995634135e-08, "logits/chosen": -2.96875, "logits/rejected": -3.171875, "logps/chosen": -636.0, "logps/rejected": -1168.0, "loss": 0.1912, "rewards/accuracies": 0.96875, "rewards/chosen": -4.59375, "rewards/margins": 5.40625, "rewards/rejected": -10.0, "step": 12760 }, { "epoch": 0.8836758701819943, "grad_norm": 25.955328385265332, "learning_rate": 2.0330516173649807e-08, "logits/chosen": -2.953125, "logits/rejected": -3.328125, "logps/chosen": -672.0, "logps/rejected": -1168.0, "loss": 0.2223, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.0, "rewards/margins": 5.03125, "rewards/rejected": -10.0, "step": 12770 }, { "epoch": 0.8843678638156529, "grad_norm": 23.863673766819467, "learning_rate": 2.009263359024588e-08, "logits/chosen": -2.96875, "logits/rejected": -3.09375, "logps/chosen": -684.0, "logps/rejected": -1168.0, "loss": 0.2564, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.09375, "rewards/margins": 4.90625, "rewards/rejected": -10.0, "step": 12780 }, { "epoch": 0.8850598574493115, "grad_norm": 25.683717742794876, "learning_rate": 1.985609263358856e-08, "logits/chosen": -2.953125, "logits/rejected": -3.3125, "logps/chosen": -684.0, "logps/rejected": -1136.0, "loss": 0.2233, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.0625, "rewards/margins": 4.71875, "rewards/rejected": -9.75, "step": 12790 }, { "epoch": 0.88575185108297, "grad_norm": 25.712806703694355, "learning_rate": 1.962089468401493e-08, "logits/chosen": -2.796875, "logits/rejected": -3.140625, "logps/chosen": -664.0, "logps/rejected": -1200.0, "loss": 0.224, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.65625, "rewards/margins": 5.4375, "rewards/rejected": -10.125, "step": 12800 }, { "epoch": 0.8864438447166286, "grad_norm": 23.36306137776369, "learning_rate": 1.938704111402503e-08, "logits/chosen": -2.96875, "logits/rejected": -3.21875, "logps/chosen": -632.0, "logps/rejected": -1152.0, "loss": 0.2037, "rewards/accuracies": 0.96875, "rewards/chosen": -4.6875, "rewards/margins": 5.1875, "rewards/rejected": -9.875, "step": 12810 }, { "epoch": 0.8871358383502872, "grad_norm": 19.872948915263603, "learning_rate": 1.9154533288273713e-08, "logits/chosen": -2.90625, "logits/rejected": -3.28125, "logps/chosen": -700.0, "logps/rejected": -1160.0, "loss": 0.2166, "rewards/accuracies": 0.96875, "rewards/chosen": -5.09375, "rewards/margins": 4.90625, "rewards/rejected": -10.0, "step": 12820 }, { "epoch": 0.8878278319839458, "grad_norm": 24.727273083752006, "learning_rate": 1.8923372563562743e-08, "logits/chosen": -2.828125, "logits/rejected": -3.0, "logps/chosen": -664.0, "logps/rejected": -1184.0, "loss": 0.2017, "rewards/accuracies": 0.96875, "rewards/chosen": -4.8125, "rewards/margins": 5.25, "rewards/rejected": -10.0625, "step": 12830 }, { "epoch": 0.8885198256176043, "grad_norm": 19.319257291869764, "learning_rate": 1.869356028883276e-08, "logits/chosen": -2.90625, "logits/rejected": -3.15625, "logps/chosen": -692.0, "logps/rejected": -1176.0, "loss": 0.187, "rewards/accuracies": 0.96875, "rewards/chosen": -5.21875, "rewards/margins": 4.8125, "rewards/rejected": -10.0625, "step": 12840 }, { "epoch": 0.8892118192512629, "grad_norm": 13.21642420286072, "learning_rate": 1.8465097805155732e-08, "logits/chosen": -2.90625, "logits/rejected": -3.09375, "logps/chosen": -644.0, "logps/rejected": -1136.0, "loss": 0.2192, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.6875, "rewards/margins": 4.90625, "rewards/rejected": -9.625, "step": 12850 }, { "epoch": 0.8899038128849215, "grad_norm": 14.082061084967892, "learning_rate": 1.8237986445726743e-08, "logits/chosen": -3.015625, "logits/rejected": -3.265625, "logps/chosen": -624.0, "logps/rejected": -1104.0, "loss": 0.2, "rewards/accuracies": 0.9375, "rewards/chosen": -4.59375, "rewards/margins": 4.875, "rewards/rejected": -9.4375, "step": 12860 }, { "epoch": 0.8905958065185801, "grad_norm": 17.493298597326426, "learning_rate": 1.8012227535856378e-08, "logits/chosen": -2.875, "logits/rejected": -3.1875, "logps/chosen": -652.0, "logps/rejected": -1168.0, "loss": 0.2033, "rewards/accuracies": 0.96875, "rewards/chosen": -4.78125, "rewards/margins": 5.25, "rewards/rejected": -10.0, "step": 12870 }, { "epoch": 0.8912878001522386, "grad_norm": 19.29886449671735, "learning_rate": 1.7787822392962976e-08, "logits/chosen": -2.90625, "logits/rejected": -3.109375, "logps/chosen": -656.0, "logps/rejected": -1160.0, "loss": 0.1956, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.78125, "rewards/margins": 5.21875, "rewards/rejected": -10.0, "step": 12880 }, { "epoch": 0.8919797937858972, "grad_norm": 13.305373847735515, "learning_rate": 1.756477232656514e-08, "logits/chosen": -2.875, "logits/rejected": -3.15625, "logps/chosen": -680.0, "logps/rejected": -1152.0, "loss": 0.1999, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.0, "rewards/margins": 5.0, "rewards/rejected": -10.0, "step": 12890 }, { "epoch": 0.8926717874195558, "grad_norm": 19.846987032725078, "learning_rate": 1.7343078638273667e-08, "logits/chosen": -3.015625, "logits/rejected": -3.34375, "logps/chosen": -672.0, "logps/rejected": -1144.0, "loss": 0.2342, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.96875, "rewards/margins": 4.9375, "rewards/rejected": -9.9375, "step": 12900 }, { "epoch": 0.8933637810532143, "grad_norm": 25.85061734383146, "learning_rate": 1.7122742621784315e-08, "logits/chosen": -2.796875, "logits/rejected": -3.234375, "logps/chosen": -640.0, "logps/rejected": -1120.0, "loss": 0.2493, "rewards/accuracies": 0.9375, "rewards/chosen": -4.71875, "rewards/margins": 4.96875, "rewards/rejected": -9.6875, "step": 12910 }, { "epoch": 0.8940557746868729, "grad_norm": 22.194748191204148, "learning_rate": 1.6903765562870154e-08, "logits/chosen": -2.9375, "logits/rejected": -3.234375, "logps/chosen": -664.0, "logps/rejected": -1128.0, "loss": 0.2256, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.78125, "rewards/margins": 4.875, "rewards/rejected": -9.625, "step": 12920 }, { "epoch": 0.8947477683205315, "grad_norm": 33.53538954926234, "learning_rate": 1.6686148739374016e-08, "logits/chosen": -3.046875, "logits/rejected": -3.34375, "logps/chosen": -656.0, "logps/rejected": -1152.0, "loss": 0.2247, "rewards/accuracies": 0.9375, "rewards/chosen": -4.875, "rewards/margins": 5.0, "rewards/rejected": -9.875, "step": 12930 }, { "epoch": 0.8954397619541901, "grad_norm": 22.8759731658327, "learning_rate": 1.6469893421201003e-08, "logits/chosen": -2.875, "logits/rejected": -3.140625, "logps/chosen": -656.0, "logps/rejected": -1144.0, "loss": 0.2149, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.75, "rewards/margins": 4.96875, "rewards/rejected": -9.6875, "step": 12940 }, { "epoch": 0.8961317555878486, "grad_norm": 18.65978513503336, "learning_rate": 1.6255000870311258e-08, "logits/chosen": -2.6875, "logits/rejected": -3.15625, "logps/chosen": -652.0, "logps/rejected": -1168.0, "loss": 0.1783, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.59375, "rewards/margins": 5.40625, "rewards/rejected": -10.0, "step": 12950 }, { "epoch": 0.8968237492215072, "grad_norm": 16.892541848747122, "learning_rate": 1.60414723407124e-08, "logits/chosen": -3.015625, "logits/rejected": -3.21875, "logps/chosen": -640.0, "logps/rejected": -1168.0, "loss": 0.1984, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.71875, "rewards/margins": 5.21875, "rewards/rejected": -9.9375, "step": 12960 }, { "epoch": 0.8975157428551658, "grad_norm": 29.153033107071444, "learning_rate": 1.5829309078452312e-08, "logits/chosen": -2.890625, "logits/rejected": -3.296875, "logps/chosen": -660.0, "logps/rejected": -1168.0, "loss": 0.1984, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.78125, "rewards/margins": 5.3125, "rewards/rejected": -10.125, "step": 12970 }, { "epoch": 0.8982077364888243, "grad_norm": 24.837991588833695, "learning_rate": 1.5618512321611848e-08, "logits/chosen": -3.0625, "logits/rejected": -3.296875, "logps/chosen": -660.0, "logps/rejected": -1192.0, "loss": 0.2087, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.96875, "rewards/margins": 5.25, "rewards/rejected": -10.25, "step": 12980 }, { "epoch": 0.8988997301224829, "grad_norm": 29.566921258934798, "learning_rate": 1.5409083300297554e-08, "logits/chosen": -2.828125, "logits/rejected": -3.28125, "logps/chosen": -640.0, "logps/rejected": -1112.0, "loss": 0.2066, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.71875, "rewards/margins": 4.78125, "rewards/rejected": -9.5, "step": 12990 }, { "epoch": 0.8995917237561415, "grad_norm": 20.129322850711503, "learning_rate": 1.5201023236634637e-08, "logits/chosen": -2.96875, "logits/rejected": -3.09375, "logps/chosen": -640.0, "logps/rejected": -1192.0, "loss": 0.2321, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.6875, "rewards/margins": 5.375, "rewards/rejected": -10.0625, "step": 13000 }, { "epoch": 0.8995917237561415, "eval_logits/chosen": -2.90625, "eval_logits/rejected": -3.1875, "eval_logps/chosen": -704.0, "eval_logps/rejected": -1120.0, "eval_loss": 0.22885014116764069, "eval_rewards/accuracies": 0.898054301738739, "eval_rewards/chosen": -5.15625, "eval_rewards/margins": 4.40625, "eval_rewards/rejected": -9.5625, "eval_runtime": 2940.7448, "eval_samples_per_second": 33.268, "eval_steps_per_second": 0.52, "step": 13000 }, { "epoch": 0.9002837173898001, "grad_norm": 16.4088501094456, "learning_rate": 1.499433334475969e-08, "logits/chosen": -2.90625, "logits/rejected": -3.328125, "logps/chosen": -676.0, "logps/rejected": -1200.0, "loss": 0.2177, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.96875, "rewards/margins": 5.40625, "rewards/rejected": -10.375, "step": 13010 }, { "epoch": 0.9009757110234586, "grad_norm": 41.85705552228657, "learning_rate": 1.4789014830813656e-08, "logits/chosen": -2.796875, "logits/rejected": -3.234375, "logps/chosen": -640.0, "logps/rejected": -1176.0, "loss": 0.2533, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.5625, "rewards/margins": 5.53125, "rewards/rejected": -10.125, "step": 13020 }, { "epoch": 0.9016677046571172, "grad_norm": 16.206905528947207, "learning_rate": 1.4585068892934705e-08, "logits/chosen": -2.890625, "logits/rejected": -3.203125, "logps/chosen": -632.0, "logps/rejected": -1168.0, "loss": 0.1894, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.5, "rewards/margins": 5.40625, "rewards/rejected": -9.875, "step": 13030 }, { "epoch": 0.9023596982907758, "grad_norm": 38.20450737512042, "learning_rate": 1.4382496721251524e-08, "logits/chosen": -2.96875, "logits/rejected": -3.140625, "logps/chosen": -672.0, "logps/rejected": -1160.0, "loss": 0.2348, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.96875, "rewards/margins": 5.03125, "rewards/rejected": -10.0, "step": 13040 }, { "epoch": 0.9030516919244342, "grad_norm": 22.238050231559544, "learning_rate": 1.4181299497876026e-08, "logits/chosen": -2.953125, "logits/rejected": -3.25, "logps/chosen": -664.0, "logps/rejected": -1192.0, "loss": 0.1945, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.84375, "rewards/margins": 5.40625, "rewards/rejected": -10.25, "step": 13050 }, { "epoch": 0.9037436855580928, "grad_norm": 27.057454367123967, "learning_rate": 1.3981478396896562e-08, "logits/chosen": -2.78125, "logits/rejected": -3.1875, "logps/chosen": -644.0, "logps/rejected": -1200.0, "loss": 0.222, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.6875, "rewards/margins": 5.625, "rewards/rejected": -10.3125, "step": 13060 }, { "epoch": 0.9044356791917514, "grad_norm": 22.923088624845462, "learning_rate": 1.378303458437119e-08, "logits/chosen": -2.921875, "logits/rejected": -3.171875, "logps/chosen": -632.0, "logps/rejected": -1128.0, "loss": 0.1972, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.53125, "rewards/margins": 5.15625, "rewards/rejected": -9.6875, "step": 13070 }, { "epoch": 0.90512767282541, "grad_norm": 16.178124493839043, "learning_rate": 1.3585969218320891e-08, "logits/chosen": -2.984375, "logits/rejected": -3.234375, "logps/chosen": -664.0, "logps/rejected": -1192.0, "loss": 0.2463, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.0, "rewards/margins": 5.15625, "rewards/rejected": -10.1875, "step": 13080 }, { "epoch": 0.9058196664590685, "grad_norm": 22.98021042794147, "learning_rate": 1.339028344872245e-08, "logits/chosen": -2.828125, "logits/rejected": -2.953125, "logps/chosen": -672.0, "logps/rejected": -1208.0, "loss": 0.2173, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.03125, "rewards/margins": 5.3125, "rewards/rejected": -10.375, "step": 13090 }, { "epoch": 0.9065116600927271, "grad_norm": 21.33100727007846, "learning_rate": 1.3195978417502196e-08, "logits/chosen": -2.8125, "logits/rejected": -3.21875, "logps/chosen": -660.0, "logps/rejected": -1128.0, "loss": 0.2175, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.78125, "rewards/margins": 5.0, "rewards/rejected": -9.8125, "step": 13100 }, { "epoch": 0.9072036537263857, "grad_norm": 27.612425714234888, "learning_rate": 1.3003055258529072e-08, "logits/chosen": -3.015625, "logits/rejected": -3.25, "logps/chosen": -672.0, "logps/rejected": -1192.0, "loss": 0.243, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.96875, "rewards/margins": 5.3125, "rewards/rejected": -10.25, "step": 13110 }, { "epoch": 0.9078956473600442, "grad_norm": 23.18497230543819, "learning_rate": 1.281151509760814e-08, "logits/chosen": -2.96875, "logits/rejected": -3.265625, "logps/chosen": -660.0, "logps/rejected": -1176.0, "loss": 0.2063, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.84375, "rewards/margins": 5.21875, "rewards/rejected": -10.0625, "step": 13120 }, { "epoch": 0.9085876409937028, "grad_norm": 26.2683986300507, "learning_rate": 1.2621359052473912e-08, "logits/chosen": -2.9375, "logits/rejected": -3.1875, "logps/chosen": -652.0, "logps/rejected": -1152.0, "loss": 0.2231, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.8125, "rewards/margins": 5.09375, "rewards/rejected": -9.875, "step": 13130 }, { "epoch": 0.9092796346273614, "grad_norm": 17.008795093444, "learning_rate": 1.2432588232783896e-08, "logits/chosen": -2.890625, "logits/rejected": -3.234375, "logps/chosen": -668.0, "logps/rejected": -1176.0, "loss": 0.2064, "rewards/accuracies": 0.96875, "rewards/chosen": -4.8125, "rewards/margins": 5.34375, "rewards/rejected": -10.125, "step": 13140 }, { "epoch": 0.90997162826102, "grad_norm": 24.80224170284737, "learning_rate": 1.224520374011212e-08, "logits/chosen": -2.8125, "logits/rejected": -3.078125, "logps/chosen": -640.0, "logps/rejected": -1168.0, "loss": 0.217, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.5625, "rewards/margins": 5.40625, "rewards/rejected": -10.0, "step": 13150 }, { "epoch": 0.9106636218946785, "grad_norm": 22.163706211434505, "learning_rate": 1.2059206667942662e-08, "logits/chosen": -2.9375, "logits/rejected": -3.3125, "logps/chosen": -672.0, "logps/rejected": -1160.0, "loss": 0.1792, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.90625, "rewards/margins": 5.125, "rewards/rejected": -10.0625, "step": 13160 }, { "epoch": 0.9113556155283371, "grad_norm": 23.708421006229532, "learning_rate": 1.1874598101663357e-08, "logits/chosen": -2.96875, "logits/rejected": -3.125, "logps/chosen": -660.0, "logps/rejected": -1192.0, "loss": 0.2127, "rewards/accuracies": 0.96875, "rewards/chosen": -4.6875, "rewards/margins": 5.40625, "rewards/rejected": -10.125, "step": 13170 }, { "epoch": 0.9120476091619957, "grad_norm": 18.947907841696967, "learning_rate": 1.1691379118559247e-08, "logits/chosen": -2.859375, "logits/rejected": -3.015625, "logps/chosen": -672.0, "logps/rejected": -1184.0, "loss": 0.1952, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.84375, "rewards/margins": 5.25, "rewards/rejected": -10.125, "step": 13180 }, { "epoch": 0.9127396027956542, "grad_norm": 25.577372930357967, "learning_rate": 1.1509550787806632e-08, "logits/chosen": -2.90625, "logits/rejected": -3.28125, "logps/chosen": -656.0, "logps/rejected": -1152.0, "loss": 0.2162, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.78125, "rewards/margins": 5.15625, "rewards/rejected": -9.9375, "step": 13190 }, { "epoch": 0.9134315964293128, "grad_norm": 30.46393151266004, "learning_rate": 1.1329114170466559e-08, "logits/chosen": -2.96875, "logits/rejected": -3.25, "logps/chosen": -664.0, "logps/rejected": -1160.0, "loss": 0.2333, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.90625, "rewards/margins": 5.1875, "rewards/rejected": -10.125, "step": 13200 }, { "epoch": 0.9141235900629714, "grad_norm": 35.70981847676936, "learning_rate": 1.1150070319478677e-08, "logits/chosen": -2.96875, "logits/rejected": -3.328125, "logps/chosen": -680.0, "logps/rejected": -1120.0, "loss": 0.2408, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.0, "rewards/margins": 4.6875, "rewards/rejected": -9.6875, "step": 13210 }, { "epoch": 0.91481558369663, "grad_norm": 19.538346314102185, "learning_rate": 1.097242027965517e-08, "logits/chosen": -2.828125, "logits/rejected": -3.03125, "logps/chosen": -656.0, "logps/rejected": -1160.0, "loss": 0.1915, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.75, "rewards/margins": 5.21875, "rewards/rejected": -9.9375, "step": 13220 }, { "epoch": 0.9155075773302885, "grad_norm": 22.755873867223904, "learning_rate": 1.0796165087674725e-08, "logits/chosen": -2.9375, "logits/rejected": -3.015625, "logps/chosen": -696.0, "logps/rejected": -1200.0, "loss": 0.209, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -5.03125, "rewards/margins": 5.15625, "rewards/rejected": -10.1875, "step": 13230 }, { "epoch": 0.9161995709639471, "grad_norm": 18.705151250013564, "learning_rate": 1.0621305772076179e-08, "logits/chosen": -2.84375, "logits/rejected": -3.0, "logps/chosen": -668.0, "logps/rejected": -1152.0, "loss": 0.2122, "rewards/accuracies": 0.9375, "rewards/chosen": -4.84375, "rewards/margins": 4.84375, "rewards/rejected": -9.6875, "step": 13240 }, { "epoch": 0.9168915645976057, "grad_norm": 17.324316532743655, "learning_rate": 1.0447843353252884e-08, "logits/chosen": -3.0, "logits/rejected": -3.296875, "logps/chosen": -656.0, "logps/rejected": -1128.0, "loss": 0.2189, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.96875, "rewards/margins": 4.84375, "rewards/rejected": -9.8125, "step": 13250 }, { "epoch": 0.9175835582312643, "grad_norm": 24.09649414025475, "learning_rate": 1.0275778843446492e-08, "logits/chosen": -2.953125, "logits/rejected": -3.125, "logps/chosen": -608.0, "logps/rejected": -1160.0, "loss": 0.1822, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.53125, "rewards/margins": 5.5625, "rewards/rejected": -10.0625, "step": 13260 }, { "epoch": 0.9182755518649228, "grad_norm": 12.672548413776193, "learning_rate": 1.0105113246741209e-08, "logits/chosen": -2.9375, "logits/rejected": -3.15625, "logps/chosen": -632.0, "logps/rejected": -1168.0, "loss": 0.1674, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.5625, "rewards/margins": 5.4375, "rewards/rejected": -10.0, "step": 13270 }, { "epoch": 0.9189675454985814, "grad_norm": 31.812334910811394, "learning_rate": 9.935847559057825e-09, "logits/chosen": -2.8125, "logits/rejected": -3.03125, "logps/chosen": -664.0, "logps/rejected": -1184.0, "loss": 0.1967, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.78125, "rewards/margins": 5.34375, "rewards/rejected": -10.125, "step": 13280 }, { "epoch": 0.91965953913224, "grad_norm": 25.58748087965262, "learning_rate": 9.767982768147943e-09, "logits/chosen": -2.875, "logits/rejected": -3.28125, "logps/chosen": -672.0, "logps/rejected": -1152.0, "loss": 0.2575, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.75, "rewards/margins": 5.21875, "rewards/rejected": -10.0, "step": 13290 }, { "epoch": 0.9203515327658985, "grad_norm": 23.389978057474817, "learning_rate": 9.601519853588285e-09, "logits/chosen": -2.828125, "logits/rejected": -3.046875, "logps/chosen": -640.0, "logps/rejected": -1176.0, "loss": 0.2277, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.75, "rewards/margins": 5.375, "rewards/rejected": -10.125, "step": 13300 }, { "epoch": 0.9210435263995571, "grad_norm": 14.597156318759874, "learning_rate": 9.43645978677482e-09, "logits/chosen": -2.953125, "logits/rejected": -3.28125, "logps/chosen": -648.0, "logps/rejected": -1152.0, "loss": 0.1812, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.78125, "rewards/margins": 5.15625, "rewards/rejected": -9.9375, "step": 13310 }, { "epoch": 0.9217355200332157, "grad_norm": 21.685373259185045, "learning_rate": 9.272803530917278e-09, "logits/chosen": -2.75, "logits/rejected": -3.203125, "logps/chosen": -680.0, "logps/rejected": -1160.0, "loss": 0.205, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.90625, "rewards/margins": 5.1875, "rewards/rejected": -10.0625, "step": 13320 }, { "epoch": 0.9224275136668743, "grad_norm": 32.67086272851641, "learning_rate": 9.110552041033281e-09, "logits/chosen": -2.8125, "logits/rejected": -3.171875, "logps/chosen": -644.0, "logps/rejected": -1208.0, "loss": 0.2128, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.59375, "rewards/margins": 5.71875, "rewards/rejected": -10.3125, "step": 13330 }, { "epoch": 0.9231195073005328, "grad_norm": 18.5946326293403, "learning_rate": 8.949706263943119e-09, "logits/chosen": -2.96875, "logits/rejected": -3.359375, "logps/chosen": -648.0, "logps/rejected": -1160.0, "loss": 0.2331, "rewards/accuracies": 0.96875, "rewards/chosen": -4.6875, "rewards/margins": 5.3125, "rewards/rejected": -10.0, "step": 13340 }, { "epoch": 0.9238115009341914, "grad_norm": 20.20852582036812, "learning_rate": 8.790267138263974e-09, "logits/chosen": -2.921875, "logits/rejected": -3.34375, "logps/chosen": -620.0, "logps/rejected": -1128.0, "loss": 0.2029, "rewards/accuracies": 0.96875, "rewards/chosen": -4.4375, "rewards/margins": 5.21875, "rewards/rejected": -9.6875, "step": 13350 }, { "epoch": 0.92450349456785, "grad_norm": 16.74842258766607, "learning_rate": 8.632235594404346e-09, "logits/chosen": -2.90625, "logits/rejected": -3.171875, "logps/chosen": -652.0, "logps/rejected": -1144.0, "loss": 0.2053, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.6875, "rewards/margins": 5.21875, "rewards/rejected": -9.875, "step": 13360 }, { "epoch": 0.9251954882015085, "grad_norm": 12.672501362738537, "learning_rate": 8.475612554558947e-09, "logits/chosen": -2.8125, "logits/rejected": -3.171875, "logps/chosen": -680.0, "logps/rejected": -1176.0, "loss": 0.1745, "rewards/accuracies": 0.96875, "rewards/chosen": -4.875, "rewards/margins": 5.375, "rewards/rejected": -10.25, "step": 13370 }, { "epoch": 0.9258874818351671, "grad_norm": 12.67465264162701, "learning_rate": 8.320398932703144e-09, "logits/chosen": -2.953125, "logits/rejected": -3.234375, "logps/chosen": -664.0, "logps/rejected": -1184.0, "loss": 0.1762, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.8125, "rewards/margins": 5.28125, "rewards/rejected": -10.125, "step": 13380 }, { "epoch": 0.9265794754688257, "grad_norm": 31.99872961443921, "learning_rate": 8.166595634587548e-09, "logits/chosen": -2.859375, "logits/rejected": -3.28125, "logps/chosen": -684.0, "logps/rejected": -1160.0, "loss": 0.2328, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.09375, "rewards/margins": 5.0, "rewards/rejected": -10.125, "step": 13390 }, { "epoch": 0.9272714691024843, "grad_norm": 12.732168802799245, "learning_rate": 8.014203557732858e-09, "logits/chosen": -3.015625, "logits/rejected": -3.390625, "logps/chosen": -632.0, "logps/rejected": -1152.0, "loss": 0.2199, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.84375, "rewards/margins": 5.25, "rewards/rejected": -10.125, "step": 13400 }, { "epoch": 0.9279634627361428, "grad_norm": 24.882468250424296, "learning_rate": 7.86322359142455e-09, "logits/chosen": -2.9375, "logits/rejected": -3.265625, "logps/chosen": -652.0, "logps/rejected": -1184.0, "loss": 0.2344, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.6875, "rewards/margins": 5.46875, "rewards/rejected": -10.125, "step": 13410 }, { "epoch": 0.9286554563698014, "grad_norm": 33.81494957056149, "learning_rate": 7.713656616707753e-09, "logits/chosen": -2.9375, "logits/rejected": -3.359375, "logps/chosen": -640.0, "logps/rejected": -1120.0, "loss": 0.2393, "rewards/accuracies": 0.9375, "rewards/chosen": -4.6875, "rewards/margins": 5.0625, "rewards/rejected": -9.75, "step": 13420 }, { "epoch": 0.92934745000346, "grad_norm": 14.113705066166471, "learning_rate": 7.565503506382076e-09, "logits/chosen": -2.8125, "logits/rejected": -3.078125, "logps/chosen": -668.0, "logps/rejected": -1136.0, "loss": 0.2324, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.875, "rewards/margins": 4.90625, "rewards/rejected": -9.8125, "step": 13430 }, { "epoch": 0.9300394436371185, "grad_norm": 21.965899523758203, "learning_rate": 7.418765124996423e-09, "logits/chosen": -2.859375, "logits/rejected": -3.34375, "logps/chosen": -680.0, "logps/rejected": -1176.0, "loss": 0.2221, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.03125, "rewards/margins": 5.09375, "rewards/rejected": -10.125, "step": 13440 }, { "epoch": 0.9307314372707771, "grad_norm": 26.860881303965918, "learning_rate": 7.273442328844137e-09, "logits/chosen": -2.984375, "logits/rejected": -3.25, "logps/chosen": -664.0, "logps/rejected": -1144.0, "loss": 0.2314, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.875, "rewards/margins": 5.03125, "rewards/rejected": -9.9375, "step": 13450 }, { "epoch": 0.9314234309044357, "grad_norm": 34.3667502318673, "learning_rate": 7.129535965957861e-09, "logits/chosen": -2.875, "logits/rejected": -3.34375, "logps/chosen": -692.0, "logps/rejected": -1208.0, "loss": 0.2521, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.9375, "rewards/margins": 5.34375, "rewards/rejected": -10.25, "step": 13460 }, { "epoch": 0.9321154245380943, "grad_norm": 26.45729972931401, "learning_rate": 6.987046876104602e-09, "logits/chosen": -2.9375, "logits/rejected": -3.015625, "logps/chosen": -660.0, "logps/rejected": -1200.0, "loss": 0.2386, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.90625, "rewards/margins": 5.40625, "rewards/rejected": -10.3125, "step": 13470 }, { "epoch": 0.9328074181717528, "grad_norm": 21.456273976285615, "learning_rate": 6.845975890780842e-09, "logits/chosen": -2.875, "logits/rejected": -3.125, "logps/chosen": -668.0, "logps/rejected": -1200.0, "loss": 0.2153, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.90625, "rewards/margins": 5.5, "rewards/rejected": -10.375, "step": 13480 }, { "epoch": 0.9334994118054114, "grad_norm": 20.479507018986055, "learning_rate": 6.706323833207794e-09, "logits/chosen": -2.984375, "logits/rejected": -3.171875, "logps/chosen": -668.0, "logps/rejected": -1152.0, "loss": 0.2335, "rewards/accuracies": 0.9375, "rewards/chosen": -5.03125, "rewards/margins": 4.8125, "rewards/rejected": -9.875, "step": 13490 }, { "epoch": 0.93419140543907, "grad_norm": 21.177406509271595, "learning_rate": 6.568091518326463e-09, "logits/chosen": -2.90625, "logits/rejected": -3.15625, "logps/chosen": -708.0, "logps/rejected": -1192.0, "loss": 0.2126, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.15625, "rewards/margins": 5.15625, "rewards/rejected": -10.3125, "step": 13500 }, { "epoch": 0.9348833990727285, "grad_norm": 29.71812528877069, "learning_rate": 6.431279752792812e-09, "logits/chosen": -2.90625, "logits/rejected": -3.15625, "logps/chosen": -652.0, "logps/rejected": -1160.0, "loss": 0.2604, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.84375, "rewards/margins": 5.03125, "rewards/rejected": -9.875, "step": 13510 }, { "epoch": 0.9355753927063871, "grad_norm": 26.54740428519338, "learning_rate": 6.295889334973325e-09, "logits/chosen": -2.859375, "logits/rejected": -3.140625, "logps/chosen": -656.0, "logps/rejected": -1112.0, "loss": 0.2487, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.8125, "rewards/margins": 4.9375, "rewards/rejected": -9.75, "step": 13520 }, { "epoch": 0.9362673863400457, "grad_norm": 19.456589678421572, "learning_rate": 6.161921054940178e-09, "logits/chosen": -2.859375, "logits/rejected": -2.984375, "logps/chosen": -648.0, "logps/rejected": -1208.0, "loss": 0.2106, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.59375, "rewards/margins": 5.75, "rewards/rejected": -10.375, "step": 13530 }, { "epoch": 0.9369593799737043, "grad_norm": 16.85758415390935, "learning_rate": 6.029375694466571e-09, "logits/chosen": -2.90625, "logits/rejected": -3.3125, "logps/chosen": -676.0, "logps/rejected": -1152.0, "loss": 0.2158, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.75, "rewards/margins": 5.0, "rewards/rejected": -9.75, "step": 13540 }, { "epoch": 0.9376513736073628, "grad_norm": 21.440804679568593, "learning_rate": 5.8982540270222924e-09, "logits/chosen": -2.984375, "logits/rejected": -3.25, "logps/chosen": -652.0, "logps/rejected": -1200.0, "loss": 0.2293, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.78125, "rewards/margins": 5.53125, "rewards/rejected": -10.3125, "step": 13550 }, { "epoch": 0.9383433672410214, "grad_norm": 27.87553174396493, "learning_rate": 5.768556817769166e-09, "logits/chosen": -2.96875, "logits/rejected": -3.25, "logps/chosen": -652.0, "logps/rejected": -1120.0, "loss": 0.2148, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.8125, "rewards/margins": 4.875, "rewards/rejected": -9.6875, "step": 13560 }, { "epoch": 0.93903536087468, "grad_norm": 25.972927831077556, "learning_rate": 5.640284823556551e-09, "logits/chosen": -2.84375, "logits/rejected": -3.140625, "logps/chosen": -660.0, "logps/rejected": -1152.0, "loss": 0.1978, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.71875, "rewards/margins": 5.15625, "rewards/rejected": -9.875, "step": 13570 }, { "epoch": 0.9397273545083386, "grad_norm": 23.150813101248325, "learning_rate": 5.513438792916964e-09, "logits/chosen": -2.9375, "logits/rejected": -3.28125, "logps/chosen": -644.0, "logps/rejected": -1168.0, "loss": 0.2078, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.65625, "rewards/margins": 5.28125, "rewards/rejected": -9.9375, "step": 13580 }, { "epoch": 0.9404193481419971, "grad_norm": 15.439640547431503, "learning_rate": 5.388019466061683e-09, "logits/chosen": -2.765625, "logits/rejected": -3.09375, "logps/chosen": -660.0, "logps/rejected": -1168.0, "loss": 0.2302, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.90625, "rewards/margins": 5.09375, "rewards/rejected": -10.0, "step": 13590 }, { "epoch": 0.9411113417756557, "grad_norm": 31.648213724039575, "learning_rate": 5.2640275748764566e-09, "logits/chosen": -2.90625, "logits/rejected": -3.453125, "logps/chosen": -648.0, "logps/rejected": -1152.0, "loss": 0.2445, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.59375, "rewards/margins": 5.25, "rewards/rejected": -9.875, "step": 13600 }, { "epoch": 0.9418033354093143, "grad_norm": 14.61519849628782, "learning_rate": 5.141463842917221e-09, "logits/chosen": -2.9375, "logits/rejected": -3.390625, "logps/chosen": -628.0, "logps/rejected": -1144.0, "loss": 0.2153, "rewards/accuracies": 0.96875, "rewards/chosen": -4.46875, "rewards/margins": 5.4375, "rewards/rejected": -9.875, "step": 13610 }, { "epoch": 0.9424953290429728, "grad_norm": 29.513322248480335, "learning_rate": 5.020328985405858e-09, "logits/chosen": -2.828125, "logits/rejected": -3.328125, "logps/chosen": -676.0, "logps/rejected": -1160.0, "loss": 0.2403, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.9375, "rewards/margins": 4.96875, "rewards/rejected": -9.875, "step": 13620 }, { "epoch": 0.9431873226766314, "grad_norm": 33.889649396654555, "learning_rate": 4.900623709226004e-09, "logits/chosen": -2.828125, "logits/rejected": -3.171875, "logps/chosen": -656.0, "logps/rejected": -1152.0, "loss": 0.22, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.84375, "rewards/margins": 5.125, "rewards/rejected": -10.0, "step": 13630 }, { "epoch": 0.94387931631029, "grad_norm": 27.732054043711308, "learning_rate": 4.782348712919049e-09, "logits/chosen": -2.90625, "logits/rejected": -3.109375, "logps/chosen": -652.0, "logps/rejected": -1160.0, "loss": 0.2489, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.8125, "rewards/margins": 5.125, "rewards/rejected": -9.9375, "step": 13640 }, { "epoch": 0.9445713099439486, "grad_norm": 26.843929901506073, "learning_rate": 4.66550468667995e-09, "logits/chosen": -3.0, "logits/rejected": -3.34375, "logps/chosen": -660.0, "logps/rejected": -1160.0, "loss": 0.2113, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.71875, "rewards/margins": 5.3125, "rewards/rejected": -10.0, "step": 13650 }, { "epoch": 0.9452633035776071, "grad_norm": 19.92357009008212, "learning_rate": 4.550092312353121e-09, "logits/chosen": -2.9375, "logits/rejected": -3.21875, "logps/chosen": -648.0, "logps/rejected": -1168.0, "loss": 0.1967, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.75, "rewards/margins": 5.3125, "rewards/rejected": -10.0625, "step": 13660 }, { "epoch": 0.9459552972112657, "grad_norm": 21.073138759940804, "learning_rate": 4.4361122634286595e-09, "logits/chosen": -2.953125, "logits/rejected": -3.15625, "logps/chosen": -644.0, "logps/rejected": -1152.0, "loss": 0.2311, "rewards/accuracies": 0.9375, "rewards/chosen": -4.6875, "rewards/margins": 5.09375, "rewards/rejected": -9.75, "step": 13670 }, { "epoch": 0.9466472908449243, "grad_norm": 20.85345814259787, "learning_rate": 4.323565205038293e-09, "logits/chosen": -2.890625, "logits/rejected": -3.15625, "logps/chosen": -676.0, "logps/rejected": -1160.0, "loss": 0.2061, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.96875, "rewards/margins": 4.9375, "rewards/rejected": -9.9375, "step": 13680 }, { "epoch": 0.9473392844785828, "grad_norm": 31.63532726209082, "learning_rate": 4.2124517939515495e-09, "logits/chosen": -2.875, "logits/rejected": -3.0, "logps/chosen": -648.0, "logps/rejected": -1160.0, "loss": 0.2437, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.75, "rewards/margins": 5.0625, "rewards/rejected": -9.8125, "step": 13690 }, { "epoch": 0.9480312781122414, "grad_norm": 21.94957080272025, "learning_rate": 4.102772678571814e-09, "logits/chosen": -2.984375, "logits/rejected": -3.1875, "logps/chosen": -640.0, "logps/rejected": -1160.0, "loss": 0.2316, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.71875, "rewards/margins": 5.21875, "rewards/rejected": -9.9375, "step": 13700 }, { "epoch": 0.9487232717459, "grad_norm": 18.177705128712976, "learning_rate": 3.994528498932642e-09, "logits/chosen": -3.015625, "logits/rejected": -3.234375, "logps/chosen": -660.0, "logps/rejected": -1176.0, "loss": 0.2093, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.9375, "rewards/margins": 5.3125, "rewards/rejected": -10.25, "step": 13710 }, { "epoch": 0.9494152653795586, "grad_norm": 21.783719375762633, "learning_rate": 3.887719886694091e-09, "logits/chosen": -2.859375, "logits/rejected": -3.15625, "logps/chosen": -660.0, "logps/rejected": -1176.0, "loss": 0.2047, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.6875, "rewards/margins": 5.40625, "rewards/rejected": -10.0625, "step": 13720 }, { "epoch": 0.9501072590132171, "grad_norm": 18.172340029498674, "learning_rate": 3.782347465138836e-09, "logits/chosen": -2.90625, "logits/rejected": -3.25, "logps/chosen": -656.0, "logps/rejected": -1168.0, "loss": 0.2095, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.75, "rewards/margins": 5.28125, "rewards/rejected": -10.0, "step": 13730 }, { "epoch": 0.9507992526468757, "grad_norm": 19.492690901670567, "learning_rate": 3.6784118491687555e-09, "logits/chosen": -2.859375, "logits/rejected": -3.296875, "logps/chosen": -684.0, "logps/rejected": -1144.0, "loss": 0.2291, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.0625, "rewards/margins": 4.90625, "rewards/rejected": -9.9375, "step": 13740 }, { "epoch": 0.9514912462805342, "grad_norm": 31.468084425105967, "learning_rate": 3.5759136453011317e-09, "logits/chosen": -2.96875, "logits/rejected": -3.296875, "logps/chosen": -656.0, "logps/rejected": -1184.0, "loss": 0.2129, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.875, "rewards/margins": 5.21875, "rewards/rejected": -10.125, "step": 13750 }, { "epoch": 0.9521832399141927, "grad_norm": 21.493669708211172, "learning_rate": 3.4748534516652884e-09, "logits/chosen": -2.96875, "logits/rejected": -3.265625, "logps/chosen": -664.0, "logps/rejected": -1176.0, "loss": 0.1819, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.84375, "rewards/margins": 5.1875, "rewards/rejected": -10.0625, "step": 13760 }, { "epoch": 0.9528752335478513, "grad_norm": 31.118086906715853, "learning_rate": 3.3752318579989837e-09, "logits/chosen": -2.796875, "logits/rejected": -3.21875, "logps/chosen": -648.0, "logps/rejected": -1168.0, "loss": 0.2322, "rewards/accuracies": 0.9375, "rewards/chosen": -4.65625, "rewards/margins": 5.4375, "rewards/rejected": -10.0625, "step": 13770 }, { "epoch": 0.9535672271815099, "grad_norm": 27.765049315578594, "learning_rate": 3.2770494456450527e-09, "logits/chosen": -3.0, "logits/rejected": -3.25, "logps/chosen": -660.0, "logps/rejected": -1152.0, "loss": 0.2629, "rewards/accuracies": 0.9375, "rewards/chosen": -4.9375, "rewards/margins": 5.0625, "rewards/rejected": -10.0, "step": 13780 }, { "epoch": 0.9542592208151685, "grad_norm": 25.61075202895016, "learning_rate": 3.1803067875479374e-09, "logits/chosen": -2.921875, "logits/rejected": -3.15625, "logps/chosen": -648.0, "logps/rejected": -1152.0, "loss": 0.1888, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.75, "rewards/margins": 5.125, "rewards/rejected": -9.875, "step": 13790 }, { "epoch": 0.954951214448827, "grad_norm": 24.138716185957264, "learning_rate": 3.0850044482503833e-09, "logits/chosen": -2.859375, "logits/rejected": -3.40625, "logps/chosen": -656.0, "logps/rejected": -1144.0, "loss": 0.196, "rewards/accuracies": 0.96875, "rewards/chosen": -4.71875, "rewards/margins": 5.25, "rewards/rejected": -9.9375, "step": 13800 }, { "epoch": 0.9556432080824856, "grad_norm": 11.725133358665525, "learning_rate": 2.991142983890166e-09, "logits/chosen": -2.9375, "logits/rejected": -3.171875, "logps/chosen": -672.0, "logps/rejected": -1208.0, "loss": 0.1968, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.84375, "rewards/margins": 5.46875, "rewards/rejected": -10.3125, "step": 13810 }, { "epoch": 0.9563352017161442, "grad_norm": 21.632806737521467, "learning_rate": 2.8987229421967853e-09, "logits/chosen": -2.96875, "logits/rejected": -3.359375, "logps/chosen": -656.0, "logps/rejected": -1128.0, "loss": 0.2332, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.84375, "rewards/margins": 4.84375, "rewards/rejected": -9.6875, "step": 13820 }, { "epoch": 0.9570271953498027, "grad_norm": 27.044038351567853, "learning_rate": 2.8077448624883315e-09, "logits/chosen": -2.84375, "logits/rejected": -3.28125, "logps/chosen": -660.0, "logps/rejected": -1184.0, "loss": 0.2462, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.78125, "rewards/margins": 5.40625, "rewards/rejected": -10.1875, "step": 13830 }, { "epoch": 0.9577191889834613, "grad_norm": 19.72517354461491, "learning_rate": 2.7182092756683206e-09, "logits/chosen": -2.875, "logits/rejected": -3.28125, "logps/chosen": -656.0, "logps/rejected": -1168.0, "loss": 0.1832, "rewards/accuracies": 0.96875, "rewards/chosen": -4.75, "rewards/margins": 5.34375, "rewards/rejected": -10.0625, "step": 13840 }, { "epoch": 0.9584111826171199, "grad_norm": 19.418562719077517, "learning_rate": 2.630116704222557e-09, "logits/chosen": -3.0, "logits/rejected": -3.234375, "logps/chosen": -632.0, "logps/rejected": -1192.0, "loss": 0.1865, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.5625, "rewards/margins": 5.8125, "rewards/rejected": -10.375, "step": 13850 }, { "epoch": 0.9591031762507785, "grad_norm": 29.219228126772144, "learning_rate": 2.543467662216081e-09, "logits/chosen": -3.015625, "logits/rejected": -3.375, "logps/chosen": -620.0, "logps/rejected": -1144.0, "loss": 0.2112, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.5, "rewards/margins": 5.34375, "rewards/rejected": -9.875, "step": 13860 }, { "epoch": 0.959795169884437, "grad_norm": 22.90794945644203, "learning_rate": 2.4582626552903107e-09, "logits/chosen": -2.90625, "logits/rejected": -3.125, "logps/chosen": -664.0, "logps/rejected": -1144.0, "loss": 0.2179, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.96875, "rewards/margins": 4.84375, "rewards/rejected": -9.8125, "step": 13870 }, { "epoch": 0.9604871635180956, "grad_norm": 22.690895542997097, "learning_rate": 2.3745021806599042e-09, "logits/chosen": -2.9375, "logits/rejected": -2.9375, "logps/chosen": -672.0, "logps/rejected": -1200.0, "loss": 0.2078, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.9375, "rewards/margins": 5.1875, "rewards/rejected": -10.125, "step": 13880 }, { "epoch": 0.9611791571517542, "grad_norm": 24.378299525234773, "learning_rate": 2.292186727109929e-09, "logits/chosen": -2.9375, "logits/rejected": -3.234375, "logps/chosen": -672.0, "logps/rejected": -1184.0, "loss": 0.2193, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.65625, "rewards/margins": 5.40625, "rewards/rejected": -10.0625, "step": 13890 }, { "epoch": 0.9618711507854127, "grad_norm": 17.71932026247264, "learning_rate": 2.2113167749930885e-09, "logits/chosen": -2.8125, "logits/rejected": -3.09375, "logps/chosen": -652.0, "logps/rejected": -1152.0, "loss": 0.1874, "rewards/accuracies": 0.9375, "rewards/chosen": -4.78125, "rewards/margins": 4.96875, "rewards/rejected": -9.75, "step": 13900 }, { "epoch": 0.9625631444190713, "grad_norm": 16.36165384615286, "learning_rate": 2.1318927962267476e-09, "logits/chosen": -2.875, "logits/rejected": -3.21875, "logps/chosen": -644.0, "logps/rejected": -1144.0, "loss": 0.1997, "rewards/accuracies": 0.96875, "rewards/chosen": -4.625, "rewards/margins": 5.15625, "rewards/rejected": -9.8125, "step": 13910 }, { "epoch": 0.9632551380527299, "grad_norm": 25.363991936439433, "learning_rate": 2.053915254290356e-09, "logits/chosen": -2.90625, "logits/rejected": -3.1875, "logps/chosen": -648.0, "logps/rejected": -1168.0, "loss": 0.2277, "rewards/accuracies": 0.96875, "rewards/chosen": -4.71875, "rewards/margins": 5.375, "rewards/rejected": -10.0625, "step": 13920 }, { "epoch": 0.9639471316863885, "grad_norm": 18.719358709591013, "learning_rate": 1.9773846042226427e-09, "logits/chosen": -2.96875, "logits/rejected": -3.296875, "logps/chosen": -624.0, "logps/rejected": -1120.0, "loss": 0.2075, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.5625, "rewards/margins": 5.0625, "rewards/rejected": -9.625, "step": 13930 }, { "epoch": 0.964639125320047, "grad_norm": 25.043807314803384, "learning_rate": 1.9023012926189796e-09, "logits/chosen": -3.0, "logits/rejected": -3.171875, "logps/chosen": -664.0, "logps/rejected": -1160.0, "loss": 0.2147, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.0, "rewards/margins": 5.03125, "rewards/rejected": -10.0, "step": 13940 }, { "epoch": 0.9653311189537056, "grad_norm": 23.928431616706195, "learning_rate": 1.8286657576288001e-09, "logits/chosen": -2.921875, "logits/rejected": -3.25, "logps/chosen": -660.0, "logps/rejected": -1160.0, "loss": 0.2536, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.8125, "rewards/margins": 5.15625, "rewards/rejected": -9.9375, "step": 13950 }, { "epoch": 0.9660231125873642, "grad_norm": 21.733217221295785, "learning_rate": 1.7564784289529899e-09, "logits/chosen": -2.9375, "logits/rejected": -3.28125, "logps/chosen": -672.0, "logps/rejected": -1144.0, "loss": 0.2195, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.875, "rewards/margins": 5.0, "rewards/rejected": -9.875, "step": 13960 }, { "epoch": 0.9667151062210227, "grad_norm": 19.350837474917086, "learning_rate": 1.6857397278414454e-09, "logits/chosen": -2.921875, "logits/rejected": -3.09375, "logps/chosen": -648.0, "logps/rejected": -1184.0, "loss": 0.189, "rewards/accuracies": 0.9375, "rewards/chosen": -4.75, "rewards/margins": 5.34375, "rewards/rejected": -10.125, "step": 13970 }, { "epoch": 0.9674070998546813, "grad_norm": 19.24654857711931, "learning_rate": 1.6164500670905467e-09, "logits/chosen": -2.921875, "logits/rejected": -3.03125, "logps/chosen": -668.0, "logps/rejected": -1168.0, "loss": 0.2279, "rewards/accuracies": 0.9375, "rewards/chosen": -4.90625, "rewards/margins": 5.125, "rewards/rejected": -10.0, "step": 13980 }, { "epoch": 0.9680990934883399, "grad_norm": 24.5385609012356, "learning_rate": 1.5486098510408553e-09, "logits/chosen": -2.875, "logits/rejected": -3.265625, "logps/chosen": -656.0, "logps/rejected": -1112.0, "loss": 0.2477, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.84375, "rewards/margins": 4.78125, "rewards/rejected": -9.625, "step": 13990 }, { "epoch": 0.9687910871219985, "grad_norm": 23.675257632114015, "learning_rate": 1.4822194755745864e-09, "logits/chosen": -2.890625, "logits/rejected": -3.140625, "logps/chosen": -648.0, "logps/rejected": -1192.0, "loss": 0.2201, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.71875, "rewards/margins": 5.46875, "rewards/rejected": -10.1875, "step": 14000 }, { "epoch": 0.9687910871219985, "eval_logits/chosen": -2.90625, "eval_logits/rejected": -3.171875, "eval_logps/chosen": -696.0, "eval_logps/rejected": -1112.0, "eval_loss": 0.2286481112241745, "eval_rewards/accuracies": 0.8977272510528564, "eval_rewards/chosen": -5.09375, "eval_rewards/margins": 4.375, "eval_rewards/rejected": -9.4375, "eval_runtime": 2938.8888, "eval_samples_per_second": 33.289, "eval_steps_per_second": 0.52, "step": 14000 }, { "epoch": 0.969483080755657, "grad_norm": 16.54330712119073, "learning_rate": 1.4172793281134465e-09, "logits/chosen": -2.96875, "logits/rejected": -3.28125, "logps/chosen": -624.0, "logps/rejected": -1136.0, "loss": 0.2379, "rewards/accuracies": 0.96875, "rewards/chosen": -4.71875, "rewards/margins": 5.15625, "rewards/rejected": -9.875, "step": 14010 }, { "epoch": 0.9701750743893156, "grad_norm": 28.39091416087011, "learning_rate": 1.3537897876163273e-09, "logits/chosen": -3.0, "logits/rejected": -3.359375, "logps/chosen": -672.0, "logps/rejected": -1184.0, "loss": 0.2502, "rewards/accuracies": 0.9375, "rewards/chosen": -5.0625, "rewards/margins": 5.15625, "rewards/rejected": -10.25, "step": 14020 }, { "epoch": 0.9708670680229742, "grad_norm": 30.474534894401195, "learning_rate": 1.2917512245770867e-09, "logits/chosen": -2.921875, "logits/rejected": -3.0, "logps/chosen": -660.0, "logps/rejected": -1216.0, "loss": 0.2178, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.84375, "rewards/margins": 5.59375, "rewards/rejected": -10.4375, "step": 14030 }, { "epoch": 0.9715590616566327, "grad_norm": 33.018969690454824, "learning_rate": 1.2311640010223834e-09, "logits/chosen": -2.84375, "logits/rejected": -3.125, "logps/chosen": -680.0, "logps/rejected": -1160.0, "loss": 0.2032, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.9375, "rewards/margins": 4.875, "rewards/rejected": -9.8125, "step": 14040 }, { "epoch": 0.9722510552902913, "grad_norm": 22.932721617814153, "learning_rate": 1.1720284705095674e-09, "logits/chosen": -2.921875, "logits/rejected": -3.03125, "logps/chosen": -680.0, "logps/rejected": -1176.0, "loss": 0.2161, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.0, "rewards/margins": 4.96875, "rewards/rejected": -9.9375, "step": 14050 }, { "epoch": 0.9729430489239499, "grad_norm": 27.6376424914995, "learning_rate": 1.1143449781245983e-09, "logits/chosen": -2.84375, "logits/rejected": -3.09375, "logps/chosen": -684.0, "logps/rejected": -1152.0, "loss": 0.2107, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.84375, "rewards/margins": 5.0, "rewards/rejected": -9.8125, "step": 14060 }, { "epoch": 0.9736350425576085, "grad_norm": 20.405239500103935, "learning_rate": 1.0581138604801033e-09, "logits/chosen": -3.0, "logits/rejected": -3.546875, "logps/chosen": -664.0, "logps/rejected": -1168.0, "loss": 0.1959, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.75, "rewards/margins": 5.40625, "rewards/rejected": -10.1875, "step": 14070 }, { "epoch": 0.974327036191267, "grad_norm": 24.887579550349272, "learning_rate": 1.0033354457133213e-09, "logits/chosen": -2.9375, "logits/rejected": -3.046875, "logps/chosen": -652.0, "logps/rejected": -1232.0, "loss": 0.2097, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.875, "rewards/margins": 5.71875, "rewards/rejected": -10.625, "step": 14080 }, { "epoch": 0.9750190298249256, "grad_norm": 28.54545461621497, "learning_rate": 9.50010053484246e-10, "logits/chosen": -2.953125, "logits/rejected": -3.3125, "logps/chosen": -684.0, "logps/rejected": -1160.0, "loss": 0.1945, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.0, "rewards/margins": 5.03125, "rewards/rejected": -10.0625, "step": 14090 }, { "epoch": 0.9757110234585842, "grad_norm": 23.320285275373074, "learning_rate": 8.981379949737356e-10, "logits/chosen": -2.890625, "logits/rejected": -3.15625, "logps/chosen": -668.0, "logps/rejected": -1168.0, "loss": 0.1953, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.9375, "rewards/margins": 5.125, "rewards/rejected": -10.0625, "step": 14100 }, { "epoch": 0.9764030170922428, "grad_norm": 14.928200097436074, "learning_rate": 8.477195728817387e-10, "logits/chosen": -2.890625, "logits/rejected": -3.109375, "logps/chosen": -644.0, "logps/rejected": -1176.0, "loss": 0.1689, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.6875, "rewards/margins": 5.40625, "rewards/rejected": -10.0625, "step": 14110 }, { "epoch": 0.9770950107259013, "grad_norm": 19.568338479291253, "learning_rate": 7.987550814254063e-10, "logits/chosen": -2.890625, "logits/rejected": -3.203125, "logps/chosen": -672.0, "logps/rejected": -1176.0, "loss": 0.221, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.84375, "rewards/margins": 5.375, "rewards/rejected": -10.1875, "step": 14120 }, { "epoch": 0.9777870043595599, "grad_norm": 17.340936435160145, "learning_rate": 7.512448063375366e-10, "logits/chosen": -2.953125, "logits/rejected": -3.125, "logps/chosen": -688.0, "logps/rejected": -1168.0, "loss": 0.2179, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.96875, "rewards/margins": 4.9375, "rewards/rejected": -9.9375, "step": 14130 }, { "epoch": 0.9784789979932185, "grad_norm": 29.415782901742865, "learning_rate": 7.051890248648829e-10, "logits/chosen": -2.8125, "logits/rejected": -3.109375, "logps/chosen": -660.0, "logps/rejected": -1128.0, "loss": 0.216, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.78125, "rewards/margins": 4.78125, "rewards/rejected": -9.5625, "step": 14140 }, { "epoch": 0.979170991626877, "grad_norm": 25.824228769167444, "learning_rate": 6.605880057663215e-10, "logits/chosen": -2.984375, "logits/rejected": -3.0625, "logps/chosen": -636.0, "logps/rejected": -1200.0, "loss": 0.2364, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.6875, "rewards/margins": 5.5625, "rewards/rejected": -10.25, "step": 14150 }, { "epoch": 0.9798629852605356, "grad_norm": 26.48511908214771, "learning_rate": 6.174420093116306e-10, "logits/chosen": -2.9375, "logits/rejected": -3.203125, "logps/chosen": -652.0, "logps/rejected": -1168.0, "loss": 0.2182, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.6875, "rewards/margins": 5.28125, "rewards/rejected": -10.0, "step": 14160 }, { "epoch": 0.9805549788941942, "grad_norm": 23.597164226680565, "learning_rate": 5.757512872796577e-10, "logits/chosen": -2.84375, "logits/rejected": -3.0625, "logps/chosen": -640.0, "logps/rejected": -1232.0, "loss": 0.1805, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.65625, "rewards/margins": 5.8125, "rewards/rejected": -10.4375, "step": 14170 }, { "epoch": 0.9812469725278528, "grad_norm": 34.74859186772393, "learning_rate": 5.355160829570438e-10, "logits/chosen": -3.015625, "logits/rejected": -3.203125, "logps/chosen": -672.0, "logps/rejected": -1208.0, "loss": 0.2303, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.0625, "rewards/margins": 5.53125, "rewards/rejected": -10.625, "step": 14180 }, { "epoch": 0.9819389661615113, "grad_norm": 30.252025488416567, "learning_rate": 4.967366311367239e-10, "logits/chosen": -3.046875, "logits/rejected": -3.3125, "logps/chosen": -628.0, "logps/rejected": -1176.0, "loss": 0.2351, "rewards/accuracies": 0.96875, "rewards/chosen": -4.46875, "rewards/margins": 5.5625, "rewards/rejected": -10.0, "step": 14190 }, { "epoch": 0.9826309597951699, "grad_norm": 12.680629724762007, "learning_rate": 4.594131581165672e-10, "logits/chosen": -2.65625, "logits/rejected": -2.921875, "logps/chosen": -652.0, "logps/rejected": -1160.0, "loss": 0.1748, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.6875, "rewards/margins": 5.21875, "rewards/rejected": -9.875, "step": 14200 }, { "epoch": 0.9833229534288285, "grad_norm": 11.118529429109094, "learning_rate": 4.2354588169812854e-10, "logits/chosen": -2.859375, "logits/rejected": -3.078125, "logps/chosen": -648.0, "logps/rejected": -1168.0, "loss": 0.2025, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.65625, "rewards/margins": 5.28125, "rewards/rejected": -9.9375, "step": 14210 }, { "epoch": 0.984014947062487, "grad_norm": 21.547766462114865, "learning_rate": 3.891350111852043e-10, "logits/chosen": -2.859375, "logits/rejected": -3.203125, "logps/chosen": -664.0, "logps/rejected": -1120.0, "loss": 0.2023, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.9375, "rewards/margins": 4.75, "rewards/rejected": -9.6875, "step": 14220 }, { "epoch": 0.9847069406961456, "grad_norm": 18.0124616451563, "learning_rate": 3.5618074738277824e-10, "logits/chosen": -2.8125, "logits/rejected": -3.21875, "logps/chosen": -680.0, "logps/rejected": -1136.0, "loss": 0.1913, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.0, "rewards/margins": 4.90625, "rewards/rejected": -9.9375, "step": 14230 }, { "epoch": 0.9853989343298042, "grad_norm": 24.942452467617375, "learning_rate": 3.246832825958279e-10, "logits/chosen": -2.765625, "logits/rejected": -3.046875, "logps/chosen": -668.0, "logps/rejected": -1168.0, "loss": 0.2326, "rewards/accuracies": 0.96875, "rewards/chosen": -4.84375, "rewards/margins": 5.03125, "rewards/rejected": -9.875, "step": 14240 }, { "epoch": 0.9860909279634628, "grad_norm": 26.909161279470506, "learning_rate": 2.9464280062807544e-10, "logits/chosen": -2.828125, "logits/rejected": -3.0625, "logps/chosen": -656.0, "logps/rejected": -1152.0, "loss": 0.215, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.875, "rewards/margins": 5.03125, "rewards/rejected": -9.9375, "step": 14250 }, { "epoch": 0.9867829215971213, "grad_norm": 25.112040006003312, "learning_rate": 2.6605947678109977e-10, "logits/chosen": -3.015625, "logits/rejected": -3.203125, "logps/chosen": -652.0, "logps/rejected": -1144.0, "loss": 0.2103, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.75, "rewards/margins": 5.0625, "rewards/rejected": -9.8125, "step": 14260 }, { "epoch": 0.9874749152307799, "grad_norm": 27.73415928875351, "learning_rate": 2.3893347785314264e-10, "logits/chosen": -2.8125, "logits/rejected": -3.03125, "logps/chosen": -656.0, "logps/rejected": -1168.0, "loss": 0.1844, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.71875, "rewards/margins": 5.34375, "rewards/rejected": -10.0625, "step": 14270 }, { "epoch": 0.9881669088644385, "grad_norm": 23.30012925760434, "learning_rate": 2.1326496213824874e-10, "logits/chosen": -2.90625, "logits/rejected": -3.03125, "logps/chosen": -632.0, "logps/rejected": -1160.0, "loss": 0.2246, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.59375, "rewards/margins": 5.34375, "rewards/rejected": -9.9375, "step": 14280 }, { "epoch": 0.988858902498097, "grad_norm": 17.117348893930657, "learning_rate": 1.8905407942532148e-10, "logits/chosen": -2.890625, "logits/rejected": -3.21875, "logps/chosen": -632.0, "logps/rejected": -1144.0, "loss": 0.2233, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.625, "rewards/margins": 5.1875, "rewards/rejected": -9.8125, "step": 14290 }, { "epoch": 0.9895508961317556, "grad_norm": 19.576652609068237, "learning_rate": 1.6630097099717966e-10, "logits/chosen": -3.03125, "logits/rejected": -3.484375, "logps/chosen": -624.0, "logps/rejected": -1136.0, "loss": 0.211, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.53125, "rewards/margins": 5.28125, "rewards/rejected": -9.8125, "step": 14300 }, { "epoch": 0.9902428897654142, "grad_norm": 12.737258785034287, "learning_rate": 1.450057696298357e-10, "logits/chosen": -2.953125, "logits/rejected": -3.296875, "logps/chosen": -660.0, "logps/rejected": -1176.0, "loss": 0.1941, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.75, "rewards/margins": 5.3125, "rewards/rejected": -10.0625, "step": 14310 }, { "epoch": 0.9909348833990728, "grad_norm": 23.238490493847465, "learning_rate": 1.2516859959166292e-10, "logits/chosen": -2.96875, "logits/rejected": -3.234375, "logps/chosen": -660.0, "logps/rejected": -1184.0, "loss": 0.2402, "rewards/accuracies": 0.9375, "rewards/chosen": -4.8125, "rewards/margins": 5.21875, "rewards/rejected": -10.0625, "step": 14320 }, { "epoch": 0.9916268770327313, "grad_norm": 16.151397892602287, "learning_rate": 1.0678957664264631e-10, "logits/chosen": -2.859375, "logits/rejected": -3.234375, "logps/chosen": -712.0, "logps/rejected": -1144.0, "loss": 0.2216, "rewards/accuracies": 0.9375, "rewards/chosen": -5.1875, "rewards/margins": 4.625, "rewards/rejected": -9.8125, "step": 14330 }, { "epoch": 0.9923188706663899, "grad_norm": 21.600217063496675, "learning_rate": 8.986880803374397e-11, "logits/chosen": -2.921875, "logits/rejected": -3.296875, "logps/chosen": -644.0, "logps/rejected": -1152.0, "loss": 0.2009, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.6875, "rewards/margins": 5.25, "rewards/rejected": -9.9375, "step": 14340 }, { "epoch": 0.9930108643000485, "grad_norm": 24.27715855373829, "learning_rate": 7.440639250630432e-11, "logits/chosen": -2.90625, "logits/rejected": -3.28125, "logps/chosen": -660.0, "logps/rejected": -1136.0, "loss": 0.1959, "rewards/accuracies": 0.9375, "rewards/chosen": -4.875, "rewards/margins": 4.90625, "rewards/rejected": -9.75, "step": 14350 }, { "epoch": 0.993702857933707, "grad_norm": 19.72213733718732, "learning_rate": 6.040242029139997e-11, "logits/chosen": -2.9375, "logits/rejected": -3.109375, "logps/chosen": -660.0, "logps/rejected": -1168.0, "loss": 0.1941, "rewards/accuracies": 0.96875, "rewards/chosen": -4.84375, "rewards/margins": 5.125, "rewards/rejected": -10.0, "step": 14360 }, { "epoch": 0.9943948515673656, "grad_norm": 28.484852262480267, "learning_rate": 4.7856973109328126e-11, "logits/chosen": -2.9375, "logits/rejected": -3.265625, "logps/chosen": -664.0, "logps/rejected": -1128.0, "loss": 0.2768, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.03125, "rewards/margins": 4.8125, "rewards/rejected": -9.8125, "step": 14370 }, { "epoch": 0.9950868452010242, "grad_norm": 21.036348049486083, "learning_rate": 3.677012416919423e-11, "logits/chosen": -3.046875, "logits/rejected": -3.265625, "logps/chosen": -648.0, "logps/rejected": -1160.0, "loss": 0.2222, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.9375, "rewards/margins": 5.03125, "rewards/rejected": -9.9375, "step": 14380 }, { "epoch": 0.9957788388346828, "grad_norm": 30.285207296023255, "learning_rate": 2.714193816841237e-11, "logits/chosen": -2.828125, "logits/rejected": -2.9375, "logps/chosen": -668.0, "logps/rejected": -1192.0, "loss": 0.2302, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.75, "rewards/margins": 5.28125, "rewards/rejected": -10.0625, "step": 14390 }, { "epoch": 0.9964708324683413, "grad_norm": 27.785622883839256, "learning_rate": 1.897247129234447e-11, "logits/chosen": -2.859375, "logits/rejected": -3.140625, "logps/chosen": -652.0, "logps/rejected": -1184.0, "loss": 0.2517, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.71875, "rewards/margins": 5.40625, "rewards/rejected": -10.125, "step": 14400 }, { "epoch": 0.9971628261019999, "grad_norm": 20.577037900954828, "learning_rate": 1.2261771214050477e-11, "logits/chosen": -2.96875, "logits/rejected": -3.203125, "logps/chosen": -656.0, "logps/rejected": -1192.0, "loss": 0.2399, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.75, "rewards/margins": 5.375, "rewards/rejected": -10.125, "step": 14410 }, { "epoch": 0.9978548197356585, "grad_norm": 30.300079981128967, "learning_rate": 7.009877093816508e-12, "logits/chosen": -2.90625, "logits/rejected": -3.203125, "logps/chosen": -668.0, "logps/rejected": -1176.0, "loss": 0.2306, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.8125, "rewards/margins": 5.25, "rewards/rejected": -10.125, "step": 14420 }, { "epoch": 0.998546813369317, "grad_norm": 25.640377975911676, "learning_rate": 3.2168195791548723e-12, "logits/chosen": -2.875, "logits/rejected": -3.203125, "logps/chosen": -700.0, "logps/rejected": -1200.0, "loss": 0.2107, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.1875, "rewards/margins": 5.0625, "rewards/rejected": -10.25, "step": 14430 }, { "epoch": 0.9992388070029756, "grad_norm": 31.67950941263206, "learning_rate": 8.826208044987459e-13, "logits/chosen": -2.953125, "logits/rejected": -3.1875, "logps/chosen": -656.0, "logps/rejected": -1176.0, "loss": 0.2338, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.8125, "rewards/margins": 5.21875, "rewards/rejected": -10.0625, "step": 14440 }, { "epoch": 0.9999308006366342, "grad_norm": 19.378713417924935, "learning_rate": 7.294391063394911e-15, "logits/chosen": -2.90625, "logits/rejected": -3.21875, "logps/chosen": -664.0, "logps/rejected": -1224.0, "loss": 0.2164, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.8125, "rewards/margins": 5.75, "rewards/rejected": -10.5625, "step": 14450 }, { "epoch": 0.5740031359784055, "grad_norm": 29.873738013422592, "learning_rate": 2.2908261047546874e-07, "logits/chosen": -2.6875, "logits/rejected": -2.875, "logps/chosen": -660.0, "logps/rejected": -1104.0, "loss": 0.3467, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": -4.90625, "rewards/margins": 4.46875, "rewards/rejected": -9.375, "step": 14460 }, { "epoch": 0.5744000952702301, "grad_norm": 44.27216990313354, "learning_rate": 2.2873741245752244e-07, "logits/chosen": -2.65625, "logits/rejected": -3.015625, "logps/chosen": -640.0, "logps/rejected": -1128.0, "loss": 0.3289, "rewards/accuracies": 0.9375, "rewards/chosen": -4.6875, "rewards/margins": 5.0625, "rewards/rejected": -9.75, "step": 14470 }, { "epoch": 0.5747970545620547, "grad_norm": 31.15521700224348, "learning_rate": 2.2839225526911142e-07, "logits/chosen": -2.78125, "logits/rejected": -2.984375, "logps/chosen": -692.0, "logps/rejected": -1112.0, "loss": 0.3458, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.09375, "rewards/margins": 4.34375, "rewards/rejected": -9.4375, "step": 14480 }, { "epoch": 0.5751940138538792, "grad_norm": 37.50446014125498, "learning_rate": 2.2804713957302465e-07, "logits/chosen": -2.875, "logits/rejected": -2.921875, "logps/chosen": -636.0, "logps/rejected": -1104.0, "loss": 0.3146, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.6875, "rewards/margins": 4.65625, "rewards/rejected": -9.3125, "step": 14490 }, { "epoch": 0.575590973145704, "grad_norm": 42.77381216680129, "learning_rate": 2.2770206603197132e-07, "logits/chosen": -2.765625, "logits/rejected": -3.140625, "logps/chosen": -704.0, "logps/rejected": -1096.0, "loss": 0.3301, "rewards/accuracies": 0.90625, "rewards/chosen": -5.09375, "rewards/margins": 4.21875, "rewards/rejected": -9.3125, "step": 14500 }, { "epoch": 0.5759879324375285, "grad_norm": 22.850645750898675, "learning_rate": 2.2735703530857975e-07, "logits/chosen": -2.84375, "logits/rejected": -2.984375, "logps/chosen": -672.0, "logps/rejected": -1104.0, "loss": 0.3009, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.90625, "rewards/margins": 4.46875, "rewards/rejected": -9.375, "step": 14510 }, { "epoch": 0.5763848917293531, "grad_norm": 24.123330414419726, "learning_rate": 2.27012048065396e-07, "logits/chosen": -2.625, "logits/rejected": -2.734375, "logps/chosen": -640.0, "logps/rejected": -1112.0, "loss": 0.3777, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.625, "rewards/margins": 4.78125, "rewards/rejected": -9.375, "step": 14520 }, { "epoch": 0.5767818510211777, "grad_norm": 26.753579204892954, "learning_rate": 2.266671049648826e-07, "logits/chosen": -2.78125, "logits/rejected": -2.796875, "logps/chosen": -668.0, "logps/rejected": -1144.0, "loss": 0.2872, "rewards/accuracies": 0.9375, "rewards/chosen": -4.875, "rewards/margins": 4.75, "rewards/rejected": -9.625, "step": 14530 }, { "epoch": 0.5771788103130024, "grad_norm": 23.674780996385582, "learning_rate": 2.263222066694176e-07, "logits/chosen": -2.6875, "logits/rejected": -2.953125, "logps/chosen": -676.0, "logps/rejected": -1096.0, "loss": 0.3481, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.875, "rewards/margins": 4.34375, "rewards/rejected": -9.25, "step": 14540 }, { "epoch": 0.577575769604827, "grad_norm": 29.466226977048176, "learning_rate": 2.2597735384129255e-07, "logits/chosen": -2.671875, "logits/rejected": -2.75, "logps/chosen": -664.0, "logps/rejected": -1088.0, "loss": 0.3358, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.9375, "rewards/margins": 4.375, "rewards/rejected": -9.3125, "step": 14550 }, { "epoch": 0.5779727288966516, "grad_norm": 23.94450720120626, "learning_rate": 2.2563254714271207e-07, "logits/chosen": -2.71875, "logits/rejected": -2.9375, "logps/chosen": -628.0, "logps/rejected": -1072.0, "loss": 0.3177, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.6875, "rewards/margins": 4.46875, "rewards/rejected": -9.125, "step": 14560 }, { "epoch": 0.5783696881884762, "grad_norm": 24.118968071587606, "learning_rate": 2.2528778723579194e-07, "logits/chosen": -2.59375, "logits/rejected": -2.828125, "logps/chosen": -636.0, "logps/rejected": -1088.0, "loss": 0.321, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.71875, "rewards/margins": 4.5625, "rewards/rejected": -9.25, "step": 14570 }, { "epoch": 0.5787666474803009, "grad_norm": 23.59081922657581, "learning_rate": 2.2494307478255837e-07, "logits/chosen": -2.765625, "logits/rejected": -2.984375, "logps/chosen": -680.0, "logps/rejected": -1064.0, "loss": 0.3138, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.84375, "rewards/margins": 4.09375, "rewards/rejected": -8.9375, "step": 14580 }, { "epoch": 0.5791636067721255, "grad_norm": 31.62002657189468, "learning_rate": 2.2459841044494608e-07, "logits/chosen": -2.78125, "logits/rejected": -2.875, "logps/chosen": -676.0, "logps/rejected": -1104.0, "loss": 0.2915, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.84375, "rewards/margins": 4.5, "rewards/rejected": -9.3125, "step": 14590 }, { "epoch": 0.5795605660639501, "grad_norm": 33.890833638883215, "learning_rate": 2.2425379488479774e-07, "logits/chosen": -2.765625, "logits/rejected": -2.984375, "logps/chosen": -672.0, "logps/rejected": -1136.0, "loss": 0.2912, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.03125, "rewards/margins": 4.5625, "rewards/rejected": -9.5625, "step": 14600 }, { "epoch": 0.5799575253557747, "grad_norm": 28.023615122309632, "learning_rate": 2.2390922876386215e-07, "logits/chosen": -2.734375, "logits/rejected": -3.0, "logps/chosen": -648.0, "logps/rejected": -1080.0, "loss": 0.3341, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.875, "rewards/margins": 4.34375, "rewards/rejected": -9.1875, "step": 14610 }, { "epoch": 0.5803544846475994, "grad_norm": 34.67667056983191, "learning_rate": 2.2356471274379339e-07, "logits/chosen": -2.796875, "logits/rejected": -2.9375, "logps/chosen": -692.0, "logps/rejected": -1072.0, "loss": 0.3719, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.1875, "rewards/margins": 3.796875, "rewards/rejected": -9.0, "step": 14620 }, { "epoch": 0.580751443939424, "grad_norm": 30.560138119232576, "learning_rate": 2.2322024748614895e-07, "logits/chosen": -2.703125, "logits/rejected": -2.71875, "logps/chosen": -692.0, "logps/rejected": -1152.0, "loss": 0.3398, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.21875, "rewards/margins": 4.40625, "rewards/rejected": -9.625, "step": 14630 }, { "epoch": 0.5811484032312486, "grad_norm": 28.298931645923492, "learning_rate": 2.2287583365238916e-07, "logits/chosen": -2.734375, "logits/rejected": -2.75, "logps/chosen": -660.0, "logps/rejected": -1088.0, "loss": 0.3257, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.875, "rewards/margins": 4.4375, "rewards/rejected": -9.3125, "step": 14640 }, { "epoch": 0.5815453625230732, "grad_norm": 23.94465490841282, "learning_rate": 2.2253147190387567e-07, "logits/chosen": -2.640625, "logits/rejected": -2.65625, "logps/chosen": -676.0, "logps/rejected": -1104.0, "loss": 0.2901, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.8125, "rewards/margins": 4.65625, "rewards/rejected": -9.5, "step": 14650 }, { "epoch": 0.5819423218148979, "grad_norm": 22.73787661422149, "learning_rate": 2.221871629018698e-07, "logits/chosen": -2.6875, "logits/rejected": -2.921875, "logps/chosen": -696.0, "logps/rejected": -1128.0, "loss": 0.3218, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.125, "rewards/margins": 4.59375, "rewards/rejected": -9.75, "step": 14660 }, { "epoch": 0.5823392811067225, "grad_norm": 36.58961271609498, "learning_rate": 2.218429073075318e-07, "logits/chosen": -2.671875, "logits/rejected": -2.8125, "logps/chosen": -692.0, "logps/rejected": -1168.0, "loss": 0.3111, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.03125, "rewards/margins": 4.75, "rewards/rejected": -9.75, "step": 14670 }, { "epoch": 0.5827362403985471, "grad_norm": 30.414492543983684, "learning_rate": 2.2149870578191925e-07, "logits/chosen": -2.75, "logits/rejected": -3.0, "logps/chosen": -624.0, "logps/rejected": -1080.0, "loss": 0.3107, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.75, "rewards/margins": 4.65625, "rewards/rejected": -9.375, "step": 14680 }, { "epoch": 0.5831331996903718, "grad_norm": 72.98089983623223, "learning_rate": 2.2115455898598598e-07, "logits/chosen": -2.78125, "logits/rejected": -2.96875, "logps/chosen": -692.0, "logps/rejected": -1096.0, "loss": 0.3547, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.28125, "rewards/margins": 4.125, "rewards/rejected": -9.375, "step": 14690 }, { "epoch": 0.5835301589821964, "grad_norm": 36.87102253348991, "learning_rate": 2.2081046758058084e-07, "logits/chosen": -2.71875, "logits/rejected": -2.9375, "logps/chosen": -664.0, "logps/rejected": -1064.0, "loss": 0.3296, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.8125, "rewards/margins": 4.09375, "rewards/rejected": -8.9375, "step": 14700 }, { "epoch": 0.583927118274021, "grad_norm": 36.00937822201201, "learning_rate": 2.2046643222644602e-07, "logits/chosen": -2.6875, "logits/rejected": -3.0, "logps/chosen": -664.0, "logps/rejected": -1040.0, "loss": 0.3508, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.875, "rewards/margins": 3.96875, "rewards/rejected": -8.8125, "step": 14710 }, { "epoch": 0.5843240775658456, "grad_norm": 37.241665067444046, "learning_rate": 2.2012245358421638e-07, "logits/chosen": -2.59375, "logits/rejected": -2.875, "logps/chosen": -620.0, "logps/rejected": -1088.0, "loss": 0.3162, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.5625, "rewards/margins": 4.625, "rewards/rejected": -9.1875, "step": 14720 }, { "epoch": 0.5847210368576703, "grad_norm": 35.04826768750344, "learning_rate": 2.197785323144176e-07, "logits/chosen": -2.6875, "logits/rejected": -2.875, "logps/chosen": -652.0, "logps/rejected": -1064.0, "loss": 0.2945, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.84375, "rewards/margins": 4.28125, "rewards/rejected": -9.125, "step": 14730 }, { "epoch": 0.5851179961494949, "grad_norm": 36.79417446846048, "learning_rate": 2.1943466907746537e-07, "logits/chosen": -2.546875, "logits/rejected": -2.84375, "logps/chosen": -684.0, "logps/rejected": -1112.0, "loss": 0.3271, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.0625, "rewards/margins": 4.375, "rewards/rejected": -9.4375, "step": 14740 }, { "epoch": 0.5855149554413195, "grad_norm": 23.67587524581565, "learning_rate": 2.1909086453366403e-07, "logits/chosen": -2.6875, "logits/rejected": -2.921875, "logps/chosen": -692.0, "logps/rejected": -1112.0, "loss": 0.3255, "rewards/accuracies": 0.9375, "rewards/chosen": -5.15625, "rewards/margins": 4.40625, "rewards/rejected": -9.5625, "step": 14750 }, { "epoch": 0.5859119147331441, "grad_norm": 28.86139288779062, "learning_rate": 2.1874711934320491e-07, "logits/chosen": -2.703125, "logits/rejected": -2.9375, "logps/chosen": -684.0, "logps/rejected": -1056.0, "loss": 0.3401, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.03125, "rewards/margins": 3.859375, "rewards/rejected": -8.875, "step": 14760 }, { "epoch": 0.5863088740249688, "grad_norm": 26.926800024257354, "learning_rate": 2.1840343416616572e-07, "logits/chosen": -2.75, "logits/rejected": -2.734375, "logps/chosen": -704.0, "logps/rejected": -1144.0, "loss": 0.2872, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.0625, "rewards/margins": 4.5625, "rewards/rejected": -9.625, "step": 14770 }, { "epoch": 0.5867058333167934, "grad_norm": 30.46815354475727, "learning_rate": 2.180598096625086e-07, "logits/chosen": -2.671875, "logits/rejected": -2.890625, "logps/chosen": -672.0, "logps/rejected": -1096.0, "loss": 0.2995, "rewards/accuracies": 0.9375, "rewards/chosen": -4.9375, "rewards/margins": 4.34375, "rewards/rejected": -9.25, "step": 14780 }, { "epoch": 0.587102792608618, "grad_norm": 34.880884135597334, "learning_rate": 2.1771624649207933e-07, "logits/chosen": -2.734375, "logits/rejected": -2.859375, "logps/chosen": -668.0, "logps/rejected": -1160.0, "loss": 0.3192, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.9375, "rewards/margins": 4.75, "rewards/rejected": -9.6875, "step": 14790 }, { "epoch": 0.5874997519004426, "grad_norm": 26.27695795039257, "learning_rate": 2.1737274531460606e-07, "logits/chosen": -2.703125, "logits/rejected": -2.859375, "logps/chosen": -632.0, "logps/rejected": -1064.0, "loss": 0.2812, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.6875, "rewards/margins": 4.3125, "rewards/rejected": -9.0, "step": 14800 }, { "epoch": 0.5878967111922673, "grad_norm": 35.2584299121902, "learning_rate": 2.1702930678969774e-07, "logits/chosen": -2.765625, "logits/rejected": -2.765625, "logps/chosen": -660.0, "logps/rejected": -1080.0, "loss": 0.3235, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.8125, "rewards/margins": 4.09375, "rewards/rejected": -8.875, "step": 14810 }, { "epoch": 0.5882936704840919, "grad_norm": 27.156546254475973, "learning_rate": 2.166859315768429e-07, "logits/chosen": -2.828125, "logits/rejected": -2.9375, "logps/chosen": -680.0, "logps/rejected": -1096.0, "loss": 0.3169, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.0625, "rewards/margins": 4.3125, "rewards/rejected": -9.375, "step": 14820 }, { "epoch": 0.5886906297759165, "grad_norm": 26.95905968214636, "learning_rate": 2.163426203354088e-07, "logits/chosen": -2.71875, "logits/rejected": -2.8125, "logps/chosen": -684.0, "logps/rejected": -1168.0, "loss": 0.3232, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.125, "rewards/margins": 4.6875, "rewards/rejected": -9.8125, "step": 14830 }, { "epoch": 0.5890875890677411, "grad_norm": 29.497529825474466, "learning_rate": 2.1599937372463952e-07, "logits/chosen": -2.71875, "logits/rejected": -2.78125, "logps/chosen": -692.0, "logps/rejected": -1136.0, "loss": 0.2462, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.0625, "rewards/margins": 4.5625, "rewards/rejected": -9.625, "step": 14840 }, { "epoch": 0.5894845483595658, "grad_norm": 31.890341640983884, "learning_rate": 2.1565619240365524e-07, "logits/chosen": -2.75, "logits/rejected": -2.921875, "logps/chosen": -692.0, "logps/rejected": -1120.0, "loss": 0.3061, "rewards/accuracies": 0.9375, "rewards/chosen": -5.125, "rewards/margins": 4.3125, "rewards/rejected": -9.4375, "step": 14850 }, { "epoch": 0.5898815076513904, "grad_norm": 29.536613059121578, "learning_rate": 2.153130770314508e-07, "logits/chosen": -2.78125, "logits/rejected": -3.078125, "logps/chosen": -696.0, "logps/rejected": -1136.0, "loss": 0.2664, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.15625, "rewards/margins": 4.625, "rewards/rejected": -9.8125, "step": 14860 }, { "epoch": 0.590278466943215, "grad_norm": 24.72878256139135, "learning_rate": 2.1497002826689425e-07, "logits/chosen": -2.734375, "logits/rejected": -3.015625, "logps/chosen": -684.0, "logps/rejected": -1096.0, "loss": 0.3218, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.0, "rewards/margins": 4.375, "rewards/rejected": -9.375, "step": 14870 }, { "epoch": 0.5906754262350395, "grad_norm": 43.07630474346726, "learning_rate": 2.146270467687259e-07, "logits/chosen": -2.78125, "logits/rejected": -3.0, "logps/chosen": -672.0, "logps/rejected": -1128.0, "loss": 0.3067, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.9375, "rewards/margins": 4.5625, "rewards/rejected": -9.5, "step": 14880 }, { "epoch": 0.5910723855268643, "grad_norm": 30.25840008415704, "learning_rate": 2.1428413319555666e-07, "logits/chosen": -2.765625, "logits/rejected": -2.90625, "logps/chosen": -652.0, "logps/rejected": -1120.0, "loss": 0.2762, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.8125, "rewards/margins": 4.65625, "rewards/rejected": -9.5, "step": 14890 }, { "epoch": 0.5914693448186888, "grad_norm": 47.80553444422609, "learning_rate": 2.1394128820586718e-07, "logits/chosen": -2.6875, "logits/rejected": -2.90625, "logps/chosen": -688.0, "logps/rejected": -1096.0, "loss": 0.3198, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.03125, "rewards/margins": 4.21875, "rewards/rejected": -9.25, "step": 14900 }, { "epoch": 0.5918663041105134, "grad_norm": 48.8689710481363, "learning_rate": 2.1359851245800653e-07, "logits/chosen": -2.59375, "logits/rejected": -2.796875, "logps/chosen": -708.0, "logps/rejected": -1120.0, "loss": 0.375, "rewards/accuracies": 0.90625, "rewards/chosen": -5.25, "rewards/margins": 4.3125, "rewards/rejected": -9.5625, "step": 14910 }, { "epoch": 0.592263263402338, "grad_norm": 20.767620154637566, "learning_rate": 2.1325580661019054e-07, "logits/chosen": -2.78125, "logits/rejected": -3.03125, "logps/chosen": -640.0, "logps/rejected": -1048.0, "loss": 0.3097, "rewards/accuracies": 0.96875, "rewards/chosen": -4.6875, "rewards/margins": 4.1875, "rewards/rejected": -8.875, "step": 14920 }, { "epoch": 0.5926602226941627, "grad_norm": 13.547686794906808, "learning_rate": 2.1291317132050102e-07, "logits/chosen": -2.703125, "logits/rejected": -2.921875, "logps/chosen": -696.0, "logps/rejected": -1120.0, "loss": 0.2851, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.96875, "rewards/margins": 4.53125, "rewards/rejected": -9.5, "step": 14930 }, { "epoch": 0.5930571819859873, "grad_norm": 20.05932919443985, "learning_rate": 2.1257060724688412e-07, "logits/chosen": -2.703125, "logits/rejected": -2.734375, "logps/chosen": -712.0, "logps/rejected": -1152.0, "loss": 0.3197, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.40625, "rewards/margins": 4.21875, "rewards/rejected": -9.625, "step": 14940 }, { "epoch": 0.5934541412778119, "grad_norm": 34.73691328653852, "learning_rate": 2.1222811504714933e-07, "logits/chosen": -2.8125, "logits/rejected": -3.078125, "logps/chosen": -696.0, "logps/rejected": -1128.0, "loss": 0.3125, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.40625, "rewards/margins": 4.375, "rewards/rejected": -9.75, "step": 14950 }, { "epoch": 0.5938511005696366, "grad_norm": 21.061479832710585, "learning_rate": 2.1188569537896823e-07, "logits/chosen": -2.71875, "logits/rejected": -3.125, "logps/chosen": -680.0, "logps/rejected": -1128.0, "loss": 0.3138, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.09375, "rewards/margins": 4.5625, "rewards/rejected": -9.6875, "step": 14960 }, { "epoch": 0.5942480598614612, "grad_norm": 27.706091398023656, "learning_rate": 2.1154334889987303e-07, "logits/chosen": -2.765625, "logits/rejected": -2.71875, "logps/chosen": -708.0, "logps/rejected": -1168.0, "loss": 0.3332, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.21875, "rewards/margins": 4.65625, "rewards/rejected": -9.875, "step": 14970 }, { "epoch": 0.5946450191532858, "grad_norm": 25.22028423999329, "learning_rate": 2.1120107626725525e-07, "logits/chosen": -2.8125, "logits/rejected": -3.125, "logps/chosen": -656.0, "logps/rejected": -1104.0, "loss": 0.291, "rewards/accuracies": 0.9375, "rewards/chosen": -4.875, "rewards/margins": 4.59375, "rewards/rejected": -9.5, "step": 14980 }, { "epoch": 0.5950419784451104, "grad_norm": 30.306595140078617, "learning_rate": 2.108588781383649e-07, "logits/chosen": -2.84375, "logits/rejected": -2.9375, "logps/chosen": -692.0, "logps/rejected": -1136.0, "loss": 0.3139, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.90625, "rewards/margins": 4.5625, "rewards/rejected": -9.5, "step": 14990 }, { "epoch": 0.5954389377369351, "grad_norm": 29.17228818762523, "learning_rate": 2.1051675517030854e-07, "logits/chosen": -2.71875, "logits/rejected": -3.125, "logps/chosen": -680.0, "logps/rejected": -1104.0, "loss": 0.3076, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.0625, "rewards/margins": 4.28125, "rewards/rejected": -9.375, "step": 15000 }, { "epoch": 0.5954389377369351, "eval_logits/chosen": -2.75, "eval_logits/rejected": -2.953125, "eval_logps/chosen": -712.0, "eval_logps/rejected": -1080.0, "eval_loss": 0.24539342522621155, "eval_rewards/accuracies": 0.8939311504364014, "eval_rewards/chosen": -5.28125, "eval_rewards/margins": 3.84375, "eval_rewards/rejected": -9.125, "eval_runtime": 5413.8814, "eval_samples_per_second": 32.627, "eval_steps_per_second": 0.51, "step": 15000 }, { "epoch": 0.5958358970287597, "grad_norm": 22.6986723532102, "learning_rate": 2.1017470802004886e-07, "logits/chosen": -2.6875, "logits/rejected": -2.859375, "logps/chosen": -664.0, "logps/rejected": -1128.0, "loss": 0.3273, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.0, "rewards/margins": 4.5625, "rewards/rejected": -9.5625, "step": 15010 }, { "epoch": 0.5962328563205843, "grad_norm": 36.15556445127337, "learning_rate": 2.098327373444027e-07, "logits/chosen": -2.71875, "logits/rejected": -3.078125, "logps/chosen": -684.0, "logps/rejected": -1112.0, "loss": 0.318, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.125, "rewards/margins": 4.46875, "rewards/rejected": -9.625, "step": 15020 }, { "epoch": 0.5966298156124089, "grad_norm": 36.093005145756464, "learning_rate": 2.0949084380003992e-07, "logits/chosen": -2.828125, "logits/rejected": -3.125, "logps/chosen": -656.0, "logps/rejected": -1112.0, "loss": 0.2485, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.78125, "rewards/margins": 4.5625, "rewards/rejected": -9.375, "step": 15030 }, { "epoch": 0.5970267749042336, "grad_norm": 22.343719852832272, "learning_rate": 2.0914902804348263e-07, "logits/chosen": -2.71875, "logits/rejected": -2.921875, "logps/chosen": -696.0, "logps/rejected": -1200.0, "loss": 0.2256, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.21875, "rewards/margins": 4.9375, "rewards/rejected": -10.1875, "step": 15040 }, { "epoch": 0.5974237341960582, "grad_norm": 30.156257725197804, "learning_rate": 2.0880729073110314e-07, "logits/chosen": -2.78125, "logits/rejected": -3.109375, "logps/chosen": -684.0, "logps/rejected": -1128.0, "loss": 0.2433, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.0625, "rewards/margins": 4.6875, "rewards/rejected": -9.75, "step": 15050 }, { "epoch": 0.5978206934878828, "grad_norm": 37.82010856090376, "learning_rate": 2.0846563251912351e-07, "logits/chosen": -2.796875, "logits/rejected": -3.1875, "logps/chosen": -680.0, "logps/rejected": -1144.0, "loss": 0.3596, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.03125, "rewards/margins": 4.75, "rewards/rejected": -9.75, "step": 15060 }, { "epoch": 0.5982176527797074, "grad_norm": 32.677541433091356, "learning_rate": 2.0812405406361382e-07, "logits/chosen": -2.6875, "logits/rejected": -3.09375, "logps/chosen": -668.0, "logps/rejected": -1096.0, "loss": 0.3138, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.8125, "rewards/margins": 4.59375, "rewards/rejected": -9.375, "step": 15070 }, { "epoch": 0.5986146120715321, "grad_norm": 30.806574820714875, "learning_rate": 2.0778255602049074e-07, "logits/chosen": -2.875, "logits/rejected": -3.046875, "logps/chosen": -648.0, "logps/rejected": -1136.0, "loss": 0.2903, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.8125, "rewards/margins": 4.875, "rewards/rejected": -9.6875, "step": 15080 }, { "epoch": 0.5990115713633567, "grad_norm": 19.803568924791115, "learning_rate": 2.0744113904551686e-07, "logits/chosen": -2.84375, "logits/rejected": -3.0, "logps/chosen": -664.0, "logps/rejected": -1120.0, "loss": 0.294, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.0, "rewards/margins": 4.59375, "rewards/rejected": -9.625, "step": 15090 }, { "epoch": 0.5994085306551813, "grad_norm": 29.027818089357897, "learning_rate": 2.0709980379429884e-07, "logits/chosen": -2.8125, "logits/rejected": -2.9375, "logps/chosen": -680.0, "logps/rejected": -1104.0, "loss": 0.3344, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.96875, "rewards/margins": 4.375, "rewards/rejected": -9.375, "step": 15100 }, { "epoch": 0.5998054899470059, "grad_norm": 30.151080190135577, "learning_rate": 2.067585509222865e-07, "logits/chosen": -2.84375, "logits/rejected": -2.84375, "logps/chosen": -680.0, "logps/rejected": -1128.0, "loss": 0.314, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.125, "rewards/margins": 4.34375, "rewards/rejected": -9.4375, "step": 15110 }, { "epoch": 0.6002024492388306, "grad_norm": 24.740399992811735, "learning_rate": 2.0641738108477169e-07, "logits/chosen": -2.765625, "logits/rejected": -3.0, "logps/chosen": -668.0, "logps/rejected": -1152.0, "loss": 0.3551, "rewards/accuracies": 0.9375, "rewards/chosen": -4.9375, "rewards/margins": 4.75, "rewards/rejected": -9.6875, "step": 15120 }, { "epoch": 0.6005994085306552, "grad_norm": 25.79812522161424, "learning_rate": 2.0607629493688647e-07, "logits/chosen": -2.765625, "logits/rejected": -2.921875, "logps/chosen": -656.0, "logps/rejected": -1104.0, "loss": 0.253, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.75, "rewards/margins": 4.53125, "rewards/rejected": -9.25, "step": 15130 }, { "epoch": 0.6009963678224798, "grad_norm": 30.53423665520749, "learning_rate": 2.0573529313360233e-07, "logits/chosen": -2.609375, "logits/rejected": -2.921875, "logps/chosen": -660.0, "logps/rejected": -1128.0, "loss": 0.2706, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.8125, "rewards/margins": 4.6875, "rewards/rejected": -9.5, "step": 15140 }, { "epoch": 0.6013933271143044, "grad_norm": 26.506063491011748, "learning_rate": 2.0539437632972894e-07, "logits/chosen": -2.65625, "logits/rejected": -2.984375, "logps/chosen": -696.0, "logps/rejected": -1112.0, "loss": 0.29, "rewards/accuracies": 0.90625, "rewards/chosen": -5.21875, "rewards/margins": 4.3125, "rewards/rejected": -9.5, "step": 15150 }, { "epoch": 0.6017902864061291, "grad_norm": 34.38460850066219, "learning_rate": 2.050535451799124e-07, "logits/chosen": -2.78125, "logits/rejected": -2.921875, "logps/chosen": -644.0, "logps/rejected": -1072.0, "loss": 0.2947, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.78125, "rewards/margins": 4.375, "rewards/rejected": -9.125, "step": 15160 }, { "epoch": 0.6021872456979537, "grad_norm": 30.15988623389949, "learning_rate": 2.0471280033863472e-07, "logits/chosen": -2.625, "logits/rejected": -2.890625, "logps/chosen": -668.0, "logps/rejected": -1112.0, "loss": 0.2967, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.9375, "rewards/margins": 4.625, "rewards/rejected": -9.5625, "step": 15170 }, { "epoch": 0.6025842049897783, "grad_norm": 25.82426245503198, "learning_rate": 2.0437214246021207e-07, "logits/chosen": -2.546875, "logits/rejected": -2.875, "logps/chosen": -688.0, "logps/rejected": -1128.0, "loss": 0.3065, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.90625, "rewards/margins": 4.5625, "rewards/rejected": -9.4375, "step": 15180 }, { "epoch": 0.602981164281603, "grad_norm": 28.444507444064534, "learning_rate": 2.0403157219879336e-07, "logits/chosen": -2.65625, "logits/rejected": -2.8125, "logps/chosen": -692.0, "logps/rejected": -1080.0, "loss": 0.2842, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.1875, "rewards/margins": 4.03125, "rewards/rejected": -9.25, "step": 15190 }, { "epoch": 0.6033781235734276, "grad_norm": 27.76869100476306, "learning_rate": 2.0369109020835965e-07, "logits/chosen": -2.765625, "logits/rejected": -2.984375, "logps/chosen": -672.0, "logps/rejected": -1128.0, "loss": 0.2909, "rewards/accuracies": 0.90625, "rewards/chosen": -5.0, "rewards/margins": 4.59375, "rewards/rejected": -9.625, "step": 15200 }, { "epoch": 0.6037750828652522, "grad_norm": 31.756995911706024, "learning_rate": 2.0335069714272208e-07, "logits/chosen": -2.71875, "logits/rejected": -3.0625, "logps/chosen": -680.0, "logps/rejected": -1168.0, "loss": 0.2807, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.125, "rewards/margins": 5.0, "rewards/rejected": -10.125, "step": 15210 }, { "epoch": 0.6041720421570768, "grad_norm": 25.4538398612401, "learning_rate": 2.0301039365552148e-07, "logits/chosen": -2.84375, "logits/rejected": -3.140625, "logps/chosen": -660.0, "logps/rejected": -1120.0, "loss": 0.2821, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.0, "rewards/margins": 4.59375, "rewards/rejected": -9.5625, "step": 15220 }, { "epoch": 0.6045690014489015, "grad_norm": 26.065783432800245, "learning_rate": 2.0267018040022642e-07, "logits/chosen": -2.65625, "logits/rejected": -2.921875, "logps/chosen": -628.0, "logps/rejected": -1096.0, "loss": 0.2907, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.625, "rewards/margins": 4.71875, "rewards/rejected": -9.3125, "step": 15230 }, { "epoch": 0.6049659607407261, "grad_norm": 17.90650886925799, "learning_rate": 2.0233005803013212e-07, "logits/chosen": -2.921875, "logits/rejected": -3.203125, "logps/chosen": -672.0, "logps/rejected": -1112.0, "loss": 0.315, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.125, "rewards/margins": 4.5625, "rewards/rejected": -9.6875, "step": 15240 }, { "epoch": 0.6053629200325507, "grad_norm": 16.006654455004654, "learning_rate": 2.0199002719835952e-07, "logits/chosen": -2.765625, "logits/rejected": -2.953125, "logps/chosen": -684.0, "logps/rejected": -1136.0, "loss": 0.2561, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.09375, "rewards/margins": 4.71875, "rewards/rejected": -9.8125, "step": 15250 }, { "epoch": 0.6057598793243752, "grad_norm": 27.055764153142622, "learning_rate": 2.0165008855785342e-07, "logits/chosen": -2.625, "logits/rejected": -2.828125, "logps/chosen": -636.0, "logps/rejected": -1104.0, "loss": 0.2659, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.625, "rewards/margins": 4.78125, "rewards/rejected": -9.4375, "step": 15260 }, { "epoch": 0.6061568386162, "grad_norm": 24.331124114299346, "learning_rate": 2.0131024276138203e-07, "logits/chosen": -2.75, "logits/rejected": -2.734375, "logps/chosen": -712.0, "logps/rejected": -1152.0, "loss": 0.2984, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.28125, "rewards/margins": 4.65625, "rewards/rejected": -9.9375, "step": 15270 }, { "epoch": 0.6065537979080246, "grad_norm": 34.665278131287636, "learning_rate": 2.0097049046153508e-07, "logits/chosen": -2.78125, "logits/rejected": -3.109375, "logps/chosen": -664.0, "logps/rejected": -1096.0, "loss": 0.3078, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.9375, "rewards/margins": 4.5, "rewards/rejected": -9.4375, "step": 15280 }, { "epoch": 0.6069507571998491, "grad_norm": 36.60096401248971, "learning_rate": 2.0063083231072262e-07, "logits/chosen": -2.71875, "logits/rejected": -2.9375, "logps/chosen": -680.0, "logps/rejected": -1152.0, "loss": 0.2973, "rewards/accuracies": 0.9375, "rewards/chosen": -4.9375, "rewards/margins": 4.8125, "rewards/rejected": -9.75, "step": 15290 }, { "epoch": 0.6073477164916737, "grad_norm": 30.92939987773986, "learning_rate": 2.0029126896117413e-07, "logits/chosen": -2.71875, "logits/rejected": -2.859375, "logps/chosen": -688.0, "logps/rejected": -1112.0, "loss": 0.3224, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.125, "rewards/margins": 4.46875, "rewards/rejected": -9.5625, "step": 15300 }, { "epoch": 0.6077446757834984, "grad_norm": 28.014728073607557, "learning_rate": 1.9995180106493673e-07, "logits/chosen": -2.65625, "logits/rejected": -2.875, "logps/chosen": -672.0, "logps/rejected": -1152.0, "loss": 0.2693, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.0, "rewards/margins": 4.75, "rewards/rejected": -9.75, "step": 15310 }, { "epoch": 0.608141635075323, "grad_norm": 25.066379196224837, "learning_rate": 1.9961242927387484e-07, "logits/chosen": -2.640625, "logits/rejected": -2.96875, "logps/chosen": -628.0, "logps/rejected": -1104.0, "loss": 0.2723, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.6875, "rewards/margins": 4.75, "rewards/rejected": -9.4375, "step": 15320 }, { "epoch": 0.6085385943671476, "grad_norm": 27.051046192194587, "learning_rate": 1.9927315423966772e-07, "logits/chosen": -2.671875, "logits/rejected": -3.0625, "logps/chosen": -660.0, "logps/rejected": -1128.0, "loss": 0.2627, "rewards/accuracies": 0.9375, "rewards/chosen": -4.8125, "rewards/margins": 4.875, "rewards/rejected": -9.6875, "step": 15330 }, { "epoch": 0.6089355536589722, "grad_norm": 21.521277158495227, "learning_rate": 1.9893397661380917e-07, "logits/chosen": -2.84375, "logits/rejected": -3.015625, "logps/chosen": -648.0, "logps/rejected": -1096.0, "loss": 0.2925, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.71875, "rewards/margins": 4.65625, "rewards/rejected": -9.375, "step": 15340 }, { "epoch": 0.6093325129507969, "grad_norm": 14.86446872137551, "learning_rate": 1.985948970476058e-07, "logits/chosen": -2.84375, "logits/rejected": -2.953125, "logps/chosen": -700.0, "logps/rejected": -1168.0, "loss": 0.2452, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.15625, "rewards/margins": 4.8125, "rewards/rejected": -10.0, "step": 15350 }, { "epoch": 0.6097294722426215, "grad_norm": 24.27347072652695, "learning_rate": 1.982559161921759e-07, "logits/chosen": -2.703125, "logits/rejected": -2.984375, "logps/chosen": -708.0, "logps/rejected": -1152.0, "loss": 0.2506, "rewards/accuracies": 0.9375, "rewards/chosen": -5.3125, "rewards/margins": 4.65625, "rewards/rejected": -9.9375, "step": 15360 }, { "epoch": 0.6101264315344461, "grad_norm": 21.29026469449461, "learning_rate": 1.9791703469844847e-07, "logits/chosen": -2.71875, "logits/rejected": -2.84375, "logps/chosen": -672.0, "logps/rejected": -1152.0, "loss": 0.2511, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.90625, "rewards/margins": 4.9375, "rewards/rejected": -9.875, "step": 15370 }, { "epoch": 0.6105233908262707, "grad_norm": 33.71849307874158, "learning_rate": 1.9757825321716137e-07, "logits/chosen": -2.6875, "logits/rejected": -2.984375, "logps/chosen": -720.0, "logps/rejected": -1176.0, "loss": 0.2839, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.34375, "rewards/margins": 4.6875, "rewards/rejected": -10.0625, "step": 15380 }, { "epoch": 0.6109203501180954, "grad_norm": 27.329378480949508, "learning_rate": 1.9723957239886066e-07, "logits/chosen": -2.765625, "logits/rejected": -2.953125, "logps/chosen": -652.0, "logps/rejected": -1152.0, "loss": 0.2688, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.8125, "rewards/margins": 5.125, "rewards/rejected": -9.9375, "step": 15390 }, { "epoch": 0.61131730940992, "grad_norm": 22.931782906210447, "learning_rate": 1.9690099289389888e-07, "logits/chosen": -2.875, "logits/rejected": -2.84375, "logps/chosen": -668.0, "logps/rejected": -1152.0, "loss": 0.2754, "rewards/accuracies": 0.9375, "rewards/chosen": -5.03125, "rewards/margins": 4.75, "rewards/rejected": -9.8125, "step": 15400 }, { "epoch": 0.6117142687017446, "grad_norm": 29.32795257366406, "learning_rate": 1.9656251535243426e-07, "logits/chosen": -2.671875, "logits/rejected": -2.75, "logps/chosen": -672.0, "logps/rejected": -1136.0, "loss": 0.2715, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.84375, "rewards/margins": 4.71875, "rewards/rejected": -9.5625, "step": 15410 }, { "epoch": 0.6121112279935692, "grad_norm": 29.01588313241981, "learning_rate": 1.9622414042442897e-07, "logits/chosen": -2.828125, "logits/rejected": -3.1875, "logps/chosen": -660.0, "logps/rejected": -1144.0, "loss": 0.2732, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.90625, "rewards/margins": 5.03125, "rewards/rejected": -9.9375, "step": 15420 }, { "epoch": 0.6125081872853939, "grad_norm": 19.928841362195186, "learning_rate": 1.958858687596484e-07, "logits/chosen": -2.765625, "logits/rejected": -2.859375, "logps/chosen": -684.0, "logps/rejected": -1136.0, "loss": 0.2938, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.25, "rewards/margins": 4.34375, "rewards/rejected": -9.625, "step": 15430 }, { "epoch": 0.6129051465772185, "grad_norm": 20.92084030384641, "learning_rate": 1.9554770100765964e-07, "logits/chosen": -2.71875, "logits/rejected": -2.6875, "logps/chosen": -680.0, "logps/rejected": -1176.0, "loss": 0.2766, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.125, "rewards/margins": 4.71875, "rewards/rejected": -9.875, "step": 15440 }, { "epoch": 0.6133021058690431, "grad_norm": 18.807084901417426, "learning_rate": 1.9520963781782992e-07, "logits/chosen": -2.59375, "logits/rejected": -2.859375, "logps/chosen": -628.0, "logps/rejected": -1096.0, "loss": 0.2663, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.5625, "rewards/margins": 4.8125, "rewards/rejected": -9.375, "step": 15450 }, { "epoch": 0.6136990651608678, "grad_norm": 29.539433659350145, "learning_rate": 1.9487167983932606e-07, "logits/chosen": -2.546875, "logits/rejected": -2.640625, "logps/chosen": -628.0, "logps/rejected": -1096.0, "loss": 0.2413, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.75, "rewards/margins": 4.59375, "rewards/rejected": -9.3125, "step": 15460 }, { "epoch": 0.6140960244526924, "grad_norm": 22.038626660919654, "learning_rate": 1.9453382772111247e-07, "logits/chosen": -2.734375, "logits/rejected": -2.921875, "logps/chosen": -680.0, "logps/rejected": -1120.0, "loss": 0.2954, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.0, "rewards/margins": 4.53125, "rewards/rejected": -9.5, "step": 15470 }, { "epoch": 0.614492983744517, "grad_norm": 27.26031743146978, "learning_rate": 1.9419608211195076e-07, "logits/chosen": -2.875, "logits/rejected": -2.8125, "logps/chosen": -704.0, "logps/rejected": -1160.0, "loss": 0.2834, "rewards/accuracies": 0.90625, "rewards/chosen": -5.28125, "rewards/margins": 4.625, "rewards/rejected": -9.9375, "step": 15480 }, { "epoch": 0.6148899430363416, "grad_norm": 38.75920142729775, "learning_rate": 1.9385844366039766e-07, "logits/chosen": -2.734375, "logits/rejected": -2.9375, "logps/chosen": -700.0, "logps/rejected": -1112.0, "loss": 0.2644, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.34375, "rewards/margins": 4.25, "rewards/rejected": -9.5625, "step": 15490 }, { "epoch": 0.6152869023281663, "grad_norm": 26.543515540783083, "learning_rate": 1.935209130148042e-07, "logits/chosen": -2.796875, "logits/rejected": -3.015625, "logps/chosen": -672.0, "logps/rejected": -1120.0, "loss": 0.273, "rewards/accuracies": 0.9375, "rewards/chosen": -5.0625, "rewards/margins": 4.59375, "rewards/rejected": -9.625, "step": 15500 }, { "epoch": 0.6156838616199909, "grad_norm": 25.023178658325275, "learning_rate": 1.9318349082331443e-07, "logits/chosen": -2.90625, "logits/rejected": -3.1875, "logps/chosen": -696.0, "logps/rejected": -1168.0, "loss": 0.2517, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.1875, "rewards/margins": 4.84375, "rewards/rejected": -10.0625, "step": 15510 }, { "epoch": 0.6160808209118155, "grad_norm": 26.51991699812434, "learning_rate": 1.928461777338641e-07, "logits/chosen": -2.78125, "logits/rejected": -2.953125, "logps/chosen": -668.0, "logps/rejected": -1136.0, "loss": 0.2761, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.9375, "rewards/margins": 4.78125, "rewards/rejected": -9.75, "step": 15520 }, { "epoch": 0.6164777802036401, "grad_norm": 28.01478518175277, "learning_rate": 1.9250897439417957e-07, "logits/chosen": -2.8125, "logits/rejected": -3.15625, "logps/chosen": -668.0, "logps/rejected": -1144.0, "loss": 0.272, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.0, "rewards/margins": 4.6875, "rewards/rejected": -9.6875, "step": 15530 }, { "epoch": 0.6168747394954648, "grad_norm": 20.980426020365872, "learning_rate": 1.9217188145177627e-07, "logits/chosen": -2.84375, "logits/rejected": -2.859375, "logps/chosen": -672.0, "logps/rejected": -1184.0, "loss": 0.2881, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.9375, "rewards/margins": 5.0625, "rewards/rejected": -10.0, "step": 15540 }, { "epoch": 0.6172716987872894, "grad_norm": 24.230355073422906, "learning_rate": 1.9183489955395785e-07, "logits/chosen": -2.734375, "logits/rejected": -2.96875, "logps/chosen": -684.0, "logps/rejected": -1152.0, "loss": 0.2607, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.125, "rewards/margins": 4.5625, "rewards/rejected": -9.6875, "step": 15550 }, { "epoch": 0.617668658079114, "grad_norm": 27.089516000576776, "learning_rate": 1.914980293478145e-07, "logits/chosen": -2.71875, "logits/rejected": -2.859375, "logps/chosen": -684.0, "logps/rejected": -1096.0, "loss": 0.2608, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.84375, "rewards/margins": 4.3125, "rewards/rejected": -9.1875, "step": 15560 }, { "epoch": 0.6180656173709386, "grad_norm": 25.642422848287577, "learning_rate": 1.9116127148022208e-07, "logits/chosen": -2.8125, "logits/rejected": -2.875, "logps/chosen": -660.0, "logps/rejected": -1136.0, "loss": 0.2923, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.84375, "rewards/margins": 4.75, "rewards/rejected": -9.625, "step": 15570 }, { "epoch": 0.6184625766627633, "grad_norm": 28.08143831645829, "learning_rate": 1.9082462659784081e-07, "logits/chosen": -2.796875, "logits/rejected": -3.09375, "logps/chosen": -676.0, "logps/rejected": -1152.0, "loss": 0.3021, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.15625, "rewards/margins": 4.84375, "rewards/rejected": -10.0, "step": 15580 }, { "epoch": 0.6188595359545879, "grad_norm": 16.184476414695695, "learning_rate": 1.904880953471137e-07, "logits/chosen": -2.828125, "logits/rejected": -3.046875, "logps/chosen": -704.0, "logps/rejected": -1104.0, "loss": 0.2908, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.34375, "rewards/margins": 4.09375, "rewards/rejected": -9.4375, "step": 15590 }, { "epoch": 0.6192564952464125, "grad_norm": 25.051420624228697, "learning_rate": 1.9015167837426584e-07, "logits/chosen": -2.71875, "logits/rejected": -3.0, "logps/chosen": -648.0, "logps/rejected": -1144.0, "loss": 0.2637, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.84375, "rewards/margins": 4.84375, "rewards/rejected": -9.6875, "step": 15600 }, { "epoch": 0.6196534545382371, "grad_norm": 21.188587366472873, "learning_rate": 1.8981537632530258e-07, "logits/chosen": -2.78125, "logits/rejected": -2.875, "logps/chosen": -696.0, "logps/rejected": -1112.0, "loss": 0.2662, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.15625, "rewards/margins": 4.3125, "rewards/rejected": -9.5, "step": 15610 }, { "epoch": 0.6200504138300618, "grad_norm": 26.445976579684515, "learning_rate": 1.8947918984600877e-07, "logits/chosen": -2.875, "logits/rejected": -3.125, "logps/chosen": -648.0, "logps/rejected": -1120.0, "loss": 0.2667, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.78125, "rewards/margins": 4.875, "rewards/rejected": -9.625, "step": 15620 }, { "epoch": 0.6204473731218864, "grad_norm": 27.287139666469244, "learning_rate": 1.8914311958194747e-07, "logits/chosen": -2.84375, "logits/rejected": -2.953125, "logps/chosen": -672.0, "logps/rejected": -1120.0, "loss": 0.2716, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.9375, "rewards/margins": 4.4375, "rewards/rejected": -9.375, "step": 15630 }, { "epoch": 0.620844332413711, "grad_norm": 34.692606322795974, "learning_rate": 1.8880716617845827e-07, "logits/chosen": -2.796875, "logits/rejected": -2.96875, "logps/chosen": -696.0, "logps/rejected": -1176.0, "loss": 0.2578, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.34375, "rewards/margins": 4.875, "rewards/rejected": -10.25, "step": 15640 }, { "epoch": 0.6212412917055355, "grad_norm": 23.925821160770145, "learning_rate": 1.8847133028065658e-07, "logits/chosen": -2.890625, "logits/rejected": -3.09375, "logps/chosen": -700.0, "logps/rejected": -1192.0, "loss": 0.3119, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.15625, "rewards/margins": 4.96875, "rewards/rejected": -10.125, "step": 15650 }, { "epoch": 0.6216382509973603, "grad_norm": 34.358535459812394, "learning_rate": 1.8813561253343208e-07, "logits/chosen": -2.90625, "logits/rejected": -3.0, "logps/chosen": -664.0, "logps/rejected": -1160.0, "loss": 0.2797, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.09375, "rewards/margins": 4.71875, "rewards/rejected": -9.875, "step": 15660 }, { "epoch": 0.6220352102891848, "grad_norm": 26.58925515383467, "learning_rate": 1.878000135814475e-07, "logits/chosen": -2.671875, "logits/rejected": -2.84375, "logps/chosen": -636.0, "logps/rejected": -1136.0, "loss": 0.2522, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.71875, "rewards/margins": 4.9375, "rewards/rejected": -9.6875, "step": 15670 }, { "epoch": 0.6224321695810094, "grad_norm": 43.228319947383845, "learning_rate": 1.874645340691376e-07, "logits/chosen": -2.765625, "logits/rejected": -2.875, "logps/chosen": -672.0, "logps/rejected": -1184.0, "loss": 0.2619, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.875, "rewards/margins": 5.125, "rewards/rejected": -10.0, "step": 15680 }, { "epoch": 0.622829128872834, "grad_norm": 27.82628699246336, "learning_rate": 1.8712917464070782e-07, "logits/chosen": -2.75, "logits/rejected": -2.921875, "logps/chosen": -664.0, "logps/rejected": -1144.0, "loss": 0.2681, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.875, "rewards/margins": 4.875, "rewards/rejected": -9.75, "step": 15690 }, { "epoch": 0.6232260881646587, "grad_norm": 31.13897601796482, "learning_rate": 1.8679393594013283e-07, "logits/chosen": -2.78125, "logits/rejected": -3.015625, "logps/chosen": -708.0, "logps/rejected": -1168.0, "loss": 0.2545, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.28125, "rewards/margins": 4.78125, "rewards/rejected": -10.0625, "step": 15700 }, { "epoch": 0.6236230474564833, "grad_norm": 29.95066374527897, "learning_rate": 1.8645881861115557e-07, "logits/chosen": -2.75, "logits/rejected": -2.8125, "logps/chosen": -704.0, "logps/rejected": -1200.0, "loss": 0.2622, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.1875, "rewards/margins": 5.09375, "rewards/rejected": -10.3125, "step": 15710 }, { "epoch": 0.6240200067483079, "grad_norm": 20.325184843748865, "learning_rate": 1.8612382329728588e-07, "logits/chosen": -2.75, "logits/rejected": -2.96875, "logps/chosen": -652.0, "logps/rejected": -1128.0, "loss": 0.3045, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.90625, "rewards/margins": 4.65625, "rewards/rejected": -9.5625, "step": 15720 }, { "epoch": 0.6244169660401326, "grad_norm": 20.608765141248863, "learning_rate": 1.8578895064179925e-07, "logits/chosen": -2.8125, "logits/rejected": -2.96875, "logps/chosen": -696.0, "logps/rejected": -1184.0, "loss": 0.2605, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -5.21875, "rewards/margins": 4.90625, "rewards/rejected": -10.125, "step": 15730 }, { "epoch": 0.6248139253319572, "grad_norm": 25.606400367810437, "learning_rate": 1.8545420128773593e-07, "logits/chosen": -2.8125, "logits/rejected": -3.171875, "logps/chosen": -680.0, "logps/rejected": -1136.0, "loss": 0.2908, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.09375, "rewards/margins": 4.71875, "rewards/rejected": -9.8125, "step": 15740 }, { "epoch": 0.6252108846237818, "grad_norm": 23.079636218500102, "learning_rate": 1.85119575877899e-07, "logits/chosen": -2.71875, "logits/rejected": -3.015625, "logps/chosen": -660.0, "logps/rejected": -1128.0, "loss": 0.2443, "rewards/accuracies": 0.9375, "rewards/chosen": -4.9375, "rewards/margins": 4.71875, "rewards/rejected": -9.625, "step": 15750 }, { "epoch": 0.6256078439156064, "grad_norm": 23.382517461857063, "learning_rate": 1.847850750548538e-07, "logits/chosen": -2.796875, "logits/rejected": -3.109375, "logps/chosen": -652.0, "logps/rejected": -1104.0, "loss": 0.2664, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.71875, "rewards/margins": 4.625, "rewards/rejected": -9.375, "step": 15760 }, { "epoch": 0.6260048032074311, "grad_norm": 34.01294685460321, "learning_rate": 1.8445069946092627e-07, "logits/chosen": -2.65625, "logits/rejected": -2.953125, "logps/chosen": -696.0, "logps/rejected": -1144.0, "loss": 0.2648, "rewards/accuracies": 0.9375, "rewards/chosen": -5.125, "rewards/margins": 4.5, "rewards/rejected": -9.625, "step": 15770 }, { "epoch": 0.6264017624992557, "grad_norm": 23.05499845807405, "learning_rate": 1.8411644973820194e-07, "logits/chosen": -2.75, "logits/rejected": -3.0625, "logps/chosen": -664.0, "logps/rejected": -1080.0, "loss": 0.2461, "rewards/accuracies": 0.9375, "rewards/chosen": -4.96875, "rewards/margins": 4.28125, "rewards/rejected": -9.25, "step": 15780 }, { "epoch": 0.6267987217910803, "grad_norm": 34.155782127267386, "learning_rate": 1.8378232652852483e-07, "logits/chosen": -2.6875, "logits/rejected": -2.890625, "logps/chosen": -692.0, "logps/rejected": -1144.0, "loss": 0.2937, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.09375, "rewards/margins": 4.65625, "rewards/rejected": -9.75, "step": 15790 }, { "epoch": 0.6271956810829049, "grad_norm": 26.377688367602012, "learning_rate": 1.8344833047349562e-07, "logits/chosen": -2.65625, "logits/rejected": -2.6875, "logps/chosen": -704.0, "logps/rejected": -1232.0, "loss": 0.2953, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.28125, "rewards/margins": 5.3125, "rewards/rejected": -10.5625, "step": 15800 }, { "epoch": 0.6275926403747296, "grad_norm": 34.8997470821859, "learning_rate": 1.8311446221447125e-07, "logits/chosen": -2.828125, "logits/rejected": -2.890625, "logps/chosen": -704.0, "logps/rejected": -1192.0, "loss": 0.2868, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.1875, "rewards/margins": 5.09375, "rewards/rejected": -10.3125, "step": 15810 }, { "epoch": 0.6279895996665542, "grad_norm": 23.06450345449958, "learning_rate": 1.8278072239256284e-07, "logits/chosen": -2.859375, "logits/rejected": -3.046875, "logps/chosen": -716.0, "logps/rejected": -1120.0, "loss": 0.289, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.40625, "rewards/margins": 4.15625, "rewards/rejected": -9.5625, "step": 15820 }, { "epoch": 0.6283865589583788, "grad_norm": 33.06870613551019, "learning_rate": 1.8244711164863517e-07, "logits/chosen": -2.8125, "logits/rejected": -2.9375, "logps/chosen": -700.0, "logps/rejected": -1120.0, "loss": 0.3008, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.25, "rewards/margins": 4.25, "rewards/rejected": -9.5, "step": 15830 }, { "epoch": 0.6287835182502034, "grad_norm": 19.403343506987124, "learning_rate": 1.8211363062330515e-07, "logits/chosen": -2.734375, "logits/rejected": -3.015625, "logps/chosen": -676.0, "logps/rejected": -1088.0, "loss": 0.2819, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.125, "rewards/margins": 4.15625, "rewards/rejected": -9.3125, "step": 15840 }, { "epoch": 0.6291804775420281, "grad_norm": 28.938172561423027, "learning_rate": 1.8178027995694056e-07, "logits/chosen": -2.703125, "logits/rejected": -2.828125, "logps/chosen": -676.0, "logps/rejected": -1128.0, "loss": 0.2655, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.8125, "rewards/margins": 4.71875, "rewards/rejected": -9.5625, "step": 15850 }, { "epoch": 0.6295774368338527, "grad_norm": 18.333611422371625, "learning_rate": 1.8144706028965867e-07, "logits/chosen": -2.703125, "logits/rejected": -2.828125, "logps/chosen": -672.0, "logps/rejected": -1152.0, "loss": 0.3081, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.0625, "rewards/margins": 4.84375, "rewards/rejected": -9.875, "step": 15860 }, { "epoch": 0.6299743961256773, "grad_norm": 26.360969685137682, "learning_rate": 1.811139722613255e-07, "logits/chosen": -2.875, "logits/rejected": -3.140625, "logps/chosen": -700.0, "logps/rejected": -1144.0, "loss": 0.2652, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.28125, "rewards/margins": 4.4375, "rewards/rejected": -9.75, "step": 15870 }, { "epoch": 0.6303713554175019, "grad_norm": 35.881725815598514, "learning_rate": 1.8078101651155397e-07, "logits/chosen": -2.828125, "logits/rejected": -3.09375, "logps/chosen": -692.0, "logps/rejected": -1160.0, "loss": 0.2483, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.21875, "rewards/margins": 4.71875, "rewards/rejected": -9.9375, "step": 15880 }, { "epoch": 0.6307683147093266, "grad_norm": 27.214389568557362, "learning_rate": 1.8044819367970332e-07, "logits/chosen": -2.671875, "logits/rejected": -2.9375, "logps/chosen": -684.0, "logps/rejected": -1144.0, "loss": 0.2582, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.0625, "rewards/margins": 4.71875, "rewards/rejected": -9.75, "step": 15890 }, { "epoch": 0.6311652740011512, "grad_norm": 24.657569753447333, "learning_rate": 1.8011550440487743e-07, "logits/chosen": -2.796875, "logits/rejected": -2.796875, "logps/chosen": -720.0, "logps/rejected": -1168.0, "loss": 0.2817, "rewards/accuracies": 0.9375, "rewards/chosen": -5.4375, "rewards/margins": 4.5, "rewards/rejected": -9.9375, "step": 15900 }, { "epoch": 0.6315622332929758, "grad_norm": 43.181200910154416, "learning_rate": 1.7978294932592352e-07, "logits/chosen": -2.734375, "logits/rejected": -2.90625, "logps/chosen": -680.0, "logps/rejected": -1184.0, "loss": 0.2729, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.0, "rewards/margins": 5.03125, "rewards/rejected": -10.0625, "step": 15910 }, { "epoch": 0.6319591925848004, "grad_norm": 16.643639286835807, "learning_rate": 1.7945052908143147e-07, "logits/chosen": -2.703125, "logits/rejected": -2.875, "logps/chosen": -680.0, "logps/rejected": -1152.0, "loss": 0.2491, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.0, "rewards/margins": 4.65625, "rewards/rejected": -9.6875, "step": 15920 }, { "epoch": 0.6323561518766251, "grad_norm": 21.56385697157277, "learning_rate": 1.7911824430973188e-07, "logits/chosen": -2.75, "logits/rejected": -3.015625, "logps/chosen": -696.0, "logps/rejected": -1120.0, "loss": 0.2711, "rewards/accuracies": 0.9375, "rewards/chosen": -5.0625, "rewards/margins": 4.4375, "rewards/rejected": -9.5, "step": 15930 }, { "epoch": 0.6327531111684497, "grad_norm": 23.03058620390345, "learning_rate": 1.7878609564889545e-07, "logits/chosen": -2.796875, "logits/rejected": -2.984375, "logps/chosen": -704.0, "logps/rejected": -1184.0, "loss": 0.239, "rewards/accuracies": 0.9375, "rewards/chosen": -5.28125, "rewards/margins": 4.875, "rewards/rejected": -10.1875, "step": 15940 }, { "epoch": 0.6331500704602743, "grad_norm": 23.996292571751802, "learning_rate": 1.7845408373673155e-07, "logits/chosen": -2.765625, "logits/rejected": -2.8125, "logps/chosen": -652.0, "logps/rejected": -1144.0, "loss": 0.2816, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.8125, "rewards/margins": 4.78125, "rewards/rejected": -9.625, "step": 15950 }, { "epoch": 0.6335470297520989, "grad_norm": 33.76610865770998, "learning_rate": 1.7812220921078675e-07, "logits/chosen": -2.78125, "logits/rejected": -2.96875, "logps/chosen": -672.0, "logps/rejected": -1144.0, "loss": 0.2854, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.9375, "rewards/margins": 4.71875, "rewards/rejected": -9.6875, "step": 15960 }, { "epoch": 0.6339439890439236, "grad_norm": 24.47026492398987, "learning_rate": 1.7779047270834398e-07, "logits/chosen": -2.828125, "logits/rejected": -3.234375, "logps/chosen": -644.0, "logps/rejected": -1088.0, "loss": 0.2724, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.75, "rewards/margins": 4.65625, "rewards/rejected": -9.375, "step": 15970 }, { "epoch": 0.6343409483357482, "grad_norm": 33.81706279145583, "learning_rate": 1.7745887486642102e-07, "logits/chosen": -2.640625, "logits/rejected": -2.828125, "logps/chosen": -636.0, "logps/rejected": -1136.0, "loss": 0.2782, "rewards/accuracies": 0.96875, "rewards/chosen": -4.65625, "rewards/margins": 5.0, "rewards/rejected": -9.625, "step": 15980 }, { "epoch": 0.6347379076275728, "grad_norm": 26.19233878206822, "learning_rate": 1.7712741632176936e-07, "logits/chosen": -2.703125, "logits/rejected": -2.703125, "logps/chosen": -664.0, "logps/rejected": -1200.0, "loss": 0.3018, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.9375, "rewards/margins": 5.1875, "rewards/rejected": -10.125, "step": 15990 }, { "epoch": 0.6351348669193975, "grad_norm": 30.258423766409525, "learning_rate": 1.767960977108732e-07, "logits/chosen": -2.921875, "logits/rejected": -3.09375, "logps/chosen": -692.0, "logps/rejected": -1144.0, "loss": 0.2781, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.21875, "rewards/margins": 4.625, "rewards/rejected": -9.875, "step": 16000 }, { "epoch": 0.6351348669193975, "eval_logits/chosen": -2.828125, "eval_logits/rejected": -3.046875, "eval_logps/chosen": -724.0, "eval_logps/rejected": -1104.0, "eval_loss": 0.24628451466560364, "eval_rewards/accuracies": 0.8946105241775513, "eval_rewards/chosen": -5.40625, "eval_rewards/margins": 4.0, "eval_rewards/rejected": -9.4375, "eval_runtime": 5413.9749, "eval_samples_per_second": 32.626, "eval_steps_per_second": 0.51, "step": 16000 }, { "epoch": 0.6355318262112221, "grad_norm": 27.32776250531351, "learning_rate": 1.764649196699479e-07, "logits/chosen": -2.875, "logits/rejected": -3.140625, "logps/chosen": -676.0, "logps/rejected": -1168.0, "loss": 0.2098, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.09375, "rewards/margins": 5.0, "rewards/rejected": -10.125, "step": 16010 }, { "epoch": 0.6359287855030467, "grad_norm": 24.492903562765054, "learning_rate": 1.7613388283493885e-07, "logits/chosen": -2.609375, "logits/rejected": -2.9375, "logps/chosen": -680.0, "logps/rejected": -1128.0, "loss": 0.25, "rewards/accuracies": 0.96875, "rewards/chosen": -5.0, "rewards/margins": 4.625, "rewards/rejected": -9.625, "step": 16020 }, { "epoch": 0.6363257447948713, "grad_norm": 32.45176766904763, "learning_rate": 1.7580298784152036e-07, "logits/chosen": -2.75, "logits/rejected": -3.0625, "logps/chosen": -652.0, "logps/rejected": -1152.0, "loss": 0.3111, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.6875, "rewards/margins": 5.3125, "rewards/rejected": -10.0, "step": 16030 }, { "epoch": 0.636722704086696, "grad_norm": 29.730283189848173, "learning_rate": 1.754722353250942e-07, "logits/chosen": -2.8125, "logits/rejected": -3.125, "logps/chosen": -660.0, "logps/rejected": -1112.0, "loss": 0.283, "rewards/accuracies": 0.9375, "rewards/chosen": -4.96875, "rewards/margins": 4.53125, "rewards/rejected": -9.5, "step": 16040 }, { "epoch": 0.6371196633785206, "grad_norm": 31.201342722446128, "learning_rate": 1.751416259207889e-07, "logits/chosen": -2.671875, "logits/rejected": -2.703125, "logps/chosen": -680.0, "logps/rejected": -1120.0, "loss": 0.2343, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.125, "rewards/margins": 4.375, "rewards/rejected": -9.5, "step": 16050 }, { "epoch": 0.6375166226703451, "grad_norm": 27.169334585421236, "learning_rate": 1.7481116026345797e-07, "logits/chosen": -2.8125, "logits/rejected": -3.03125, "logps/chosen": -676.0, "logps/rejected": -1152.0, "loss": 0.2104, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.03125, "rewards/margins": 4.96875, "rewards/rejected": -10.0, "step": 16060 }, { "epoch": 0.6379135819621697, "grad_norm": 26.466188992590958, "learning_rate": 1.744808389876787e-07, "logits/chosen": -2.703125, "logits/rejected": -2.96875, "logps/chosen": -680.0, "logps/rejected": -1200.0, "loss": 0.2481, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.9375, "rewards/margins": 5.1875, "rewards/rejected": -10.125, "step": 16070 }, { "epoch": 0.6383105412539944, "grad_norm": 29.575102221147656, "learning_rate": 1.741506627277515e-07, "logits/chosen": -2.71875, "logits/rejected": -2.90625, "logps/chosen": -652.0, "logps/rejected": -1144.0, "loss": 0.2954, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.78125, "rewards/margins": 4.9375, "rewards/rejected": -9.6875, "step": 16080 }, { "epoch": 0.638707500545819, "grad_norm": 17.51726218473338, "learning_rate": 1.7382063211769792e-07, "logits/chosen": -2.71875, "logits/rejected": -2.984375, "logps/chosen": -636.0, "logps/rejected": -1184.0, "loss": 0.2205, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.5, "rewards/margins": 5.75, "rewards/rejected": -10.25, "step": 16090 }, { "epoch": 0.6391044598376436, "grad_norm": 28.402759179640668, "learning_rate": 1.7349074779126028e-07, "logits/chosen": -2.78125, "logits/rejected": -3.28125, "logps/chosen": -672.0, "logps/rejected": -1160.0, "loss": 0.2297, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.84375, "rewards/margins": 5.1875, "rewards/rejected": -10.0, "step": 16100 }, { "epoch": 0.6395014191294682, "grad_norm": 33.952579216851476, "learning_rate": 1.731610103818997e-07, "logits/chosen": -2.8125, "logits/rejected": -2.984375, "logps/chosen": -672.0, "logps/rejected": -1192.0, "loss": 0.2266, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.03125, "rewards/margins": 5.21875, "rewards/rejected": -10.25, "step": 16110 }, { "epoch": 0.6398983784212929, "grad_norm": 30.416639398410066, "learning_rate": 1.7283142052279512e-07, "logits/chosen": -2.796875, "logits/rejected": -2.96875, "logps/chosen": -684.0, "logps/rejected": -1168.0, "loss": 0.2508, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.125, "rewards/margins": 4.8125, "rewards/rejected": -9.9375, "step": 16120 }, { "epoch": 0.6402953377131175, "grad_norm": 32.64083989206097, "learning_rate": 1.7250197884684238e-07, "logits/chosen": -2.703125, "logits/rejected": -3.046875, "logps/chosen": -640.0, "logps/rejected": -1112.0, "loss": 0.2714, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.65625, "rewards/margins": 4.90625, "rewards/rejected": -9.5625, "step": 16130 }, { "epoch": 0.6406922970049421, "grad_norm": 28.566487161000527, "learning_rate": 1.7217268598665255e-07, "logits/chosen": -2.859375, "logits/rejected": -3.03125, "logps/chosen": -688.0, "logps/rejected": -1216.0, "loss": 0.271, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.0, "rewards/margins": 5.3125, "rewards/rejected": -10.3125, "step": 16140 }, { "epoch": 0.6410892562967667, "grad_norm": 38.81451072618053, "learning_rate": 1.7184354257455115e-07, "logits/chosen": -2.875, "logits/rejected": -3.109375, "logps/chosen": -660.0, "logps/rejected": -1120.0, "loss": 0.2794, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.9375, "rewards/margins": 4.71875, "rewards/rejected": -9.6875, "step": 16150 }, { "epoch": 0.6414862155885914, "grad_norm": 26.883558672472443, "learning_rate": 1.7151454924257665e-07, "logits/chosen": -2.875, "logits/rejected": -3.0, "logps/chosen": -680.0, "logps/rejected": -1168.0, "loss": 0.2506, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.09375, "rewards/margins": 4.78125, "rewards/rejected": -9.875, "step": 16160 }, { "epoch": 0.641883174880416, "grad_norm": 23.581583903830452, "learning_rate": 1.7118570662247911e-07, "logits/chosen": -2.875, "logits/rejected": -2.953125, "logps/chosen": -696.0, "logps/rejected": -1176.0, "loss": 0.2779, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.125, "rewards/margins": 4.90625, "rewards/rejected": -10.0, "step": 16170 }, { "epoch": 0.6422801341722406, "grad_norm": 29.970818541247386, "learning_rate": 1.7085701534571955e-07, "logits/chosen": -2.890625, "logits/rejected": -2.859375, "logps/chosen": -684.0, "logps/rejected": -1168.0, "loss": 0.2973, "rewards/accuracies": 0.9375, "rewards/chosen": -5.15625, "rewards/margins": 4.78125, "rewards/rejected": -9.9375, "step": 16180 }, { "epoch": 0.6426770934640652, "grad_norm": 29.707401925312485, "learning_rate": 1.705284760434681e-07, "logits/chosen": -2.90625, "logits/rejected": -3.171875, "logps/chosen": -668.0, "logps/rejected": -1200.0, "loss": 0.2732, "rewards/accuracies": 0.96875, "rewards/chosen": -4.96875, "rewards/margins": 5.34375, "rewards/rejected": -10.3125, "step": 16190 }, { "epoch": 0.6430740527558899, "grad_norm": 26.874604729114992, "learning_rate": 1.7020008934660308e-07, "logits/chosen": -2.8125, "logits/rejected": -2.96875, "logps/chosen": -664.0, "logps/rejected": -1128.0, "loss": 0.2535, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.90625, "rewards/margins": 4.6875, "rewards/rejected": -9.5625, "step": 16200 }, { "epoch": 0.6434710120477145, "grad_norm": 23.099410577317844, "learning_rate": 1.6987185588571e-07, "logits/chosen": -2.828125, "logits/rejected": -3.109375, "logps/chosen": -688.0, "logps/rejected": -1128.0, "loss": 0.2781, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.0625, "rewards/margins": 4.5, "rewards/rejected": -9.5625, "step": 16210 }, { "epoch": 0.6438679713395391, "grad_norm": 16.91977715695345, "learning_rate": 1.6954377629107994e-07, "logits/chosen": -2.796875, "logits/rejected": -3.03125, "logps/chosen": -680.0, "logps/rejected": -1200.0, "loss": 0.2515, "rewards/accuracies": 0.9375, "rewards/chosen": -5.0, "rewards/margins": 5.3125, "rewards/rejected": -10.3125, "step": 16220 }, { "epoch": 0.6442649306313638, "grad_norm": 40.695256491340544, "learning_rate": 1.6921585119270843e-07, "logits/chosen": -2.90625, "logits/rejected": -3.109375, "logps/chosen": -656.0, "logps/rejected": -1120.0, "loss": 0.2579, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.90625, "rewards/margins": 4.6875, "rewards/rejected": -9.5625, "step": 16230 }, { "epoch": 0.6446618899231884, "grad_norm": 26.508215374831508, "learning_rate": 1.6888808122029455e-07, "logits/chosen": -2.734375, "logits/rejected": -2.859375, "logps/chosen": -648.0, "logps/rejected": -1168.0, "loss": 0.2473, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.75, "rewards/margins": 5.25, "rewards/rejected": -10.0, "step": 16240 }, { "epoch": 0.645058849215013, "grad_norm": 31.864504344056783, "learning_rate": 1.6856046700323923e-07, "logits/chosen": -2.578125, "logits/rejected": -2.78125, "logps/chosen": -624.0, "logps/rejected": -1112.0, "loss": 0.2434, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.53125, "rewards/margins": 4.9375, "rewards/rejected": -9.5, "step": 16250 }, { "epoch": 0.6454558085068376, "grad_norm": 37.777676398653625, "learning_rate": 1.6823300917064458e-07, "logits/chosen": -2.625, "logits/rejected": -2.9375, "logps/chosen": -680.0, "logps/rejected": -1144.0, "loss": 0.2518, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.0, "rewards/margins": 4.625, "rewards/rejected": -9.625, "step": 16260 }, { "epoch": 0.6458527677986623, "grad_norm": 22.198131465584538, "learning_rate": 1.6790570835131234e-07, "logits/chosen": -2.765625, "logits/rejected": -2.984375, "logps/chosen": -648.0, "logps/rejected": -1160.0, "loss": 0.2444, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.875, "rewards/margins": 5.15625, "rewards/rejected": -10.0625, "step": 16270 }, { "epoch": 0.6462497270904869, "grad_norm": 28.669375249367615, "learning_rate": 1.6757856517374258e-07, "logits/chosen": -2.59375, "logits/rejected": -2.796875, "logps/chosen": -704.0, "logps/rejected": -1152.0, "loss": 0.2431, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.125, "rewards/margins": 4.84375, "rewards/rejected": -9.9375, "step": 16280 }, { "epoch": 0.6466466863823115, "grad_norm": 27.816435832137163, "learning_rate": 1.6725158026613291e-07, "logits/chosen": -2.859375, "logits/rejected": -3.015625, "logps/chosen": -672.0, "logps/rejected": -1208.0, "loss": 0.2439, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.0625, "rewards/margins": 5.15625, "rewards/rejected": -10.25, "step": 16290 }, { "epoch": 0.6470436456741361, "grad_norm": 31.696693334569527, "learning_rate": 1.669247542563767e-07, "logits/chosen": -2.71875, "logits/rejected": -2.875, "logps/chosen": -644.0, "logps/rejected": -1168.0, "loss": 0.2876, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.8125, "rewards/margins": 5.3125, "rewards/rejected": -10.125, "step": 16300 }, { "epoch": 0.6474406049659608, "grad_norm": 23.28641623168192, "learning_rate": 1.6659808777206258e-07, "logits/chosen": -2.75, "logits/rejected": -2.90625, "logps/chosen": -700.0, "logps/rejected": -1104.0, "loss": 0.2574, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.0625, "rewards/margins": 4.34375, "rewards/rejected": -9.4375, "step": 16310 }, { "epoch": 0.6478375642577854, "grad_norm": 27.928232217512164, "learning_rate": 1.6627158144047263e-07, "logits/chosen": -2.75, "logits/rejected": -2.875, "logps/chosen": -672.0, "logps/rejected": -1144.0, "loss": 0.2825, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.9375, "rewards/margins": 4.65625, "rewards/rejected": -9.625, "step": 16320 }, { "epoch": 0.64823452354961, "grad_norm": 35.7952369862327, "learning_rate": 1.6594523588858134e-07, "logits/chosen": -2.71875, "logits/rejected": -2.828125, "logps/chosen": -668.0, "logps/rejected": -1128.0, "loss": 0.2553, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.84375, "rewards/margins": 4.84375, "rewards/rejected": -9.6875, "step": 16330 }, { "epoch": 0.6486314828414346, "grad_norm": 35.36940438472878, "learning_rate": 1.656190517430546e-07, "logits/chosen": -2.6875, "logits/rejected": -2.921875, "logps/chosen": -672.0, "logps/rejected": -1112.0, "loss": 0.2339, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.84375, "rewards/margins": 4.5625, "rewards/rejected": -9.4375, "step": 16340 }, { "epoch": 0.6490284421332593, "grad_norm": 27.18496522809295, "learning_rate": 1.6529302963024815e-07, "logits/chosen": -2.71875, "logits/rejected": -2.984375, "logps/chosen": -624.0, "logps/rejected": -1064.0, "loss": 0.244, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.65625, "rewards/margins": 4.46875, "rewards/rejected": -9.125, "step": 16350 }, { "epoch": 0.6494254014250839, "grad_norm": 26.45854282517819, "learning_rate": 1.6496717017620708e-07, "logits/chosen": -2.625, "logits/rejected": -2.890625, "logps/chosen": -640.0, "logps/rejected": -1096.0, "loss": 0.2347, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.75, "rewards/margins": 4.59375, "rewards/rejected": -9.3125, "step": 16360 }, { "epoch": 0.6498223607169085, "grad_norm": 31.52751210964204, "learning_rate": 1.646414740066635e-07, "logits/chosen": -2.71875, "logits/rejected": -2.96875, "logps/chosen": -676.0, "logps/rejected": -1160.0, "loss": 0.2391, "rewards/accuracies": 0.96875, "rewards/chosen": -4.8125, "rewards/margins": 5.09375, "rewards/rejected": -9.875, "step": 16370 }, { "epoch": 0.6502193200087331, "grad_norm": 23.428875252662728, "learning_rate": 1.6431594174703644e-07, "logits/chosen": -2.671875, "logits/rejected": -2.875, "logps/chosen": -660.0, "logps/rejected": -1176.0, "loss": 0.2305, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.8125, "rewards/margins": 5.25, "rewards/rejected": -10.0625, "step": 16380 }, { "epoch": 0.6506162793005578, "grad_norm": 30.149553540720085, "learning_rate": 1.6399057402242987e-07, "logits/chosen": -2.71875, "logits/rejected": -2.84375, "logps/chosen": -704.0, "logps/rejected": -1168.0, "loss": 0.2473, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.15625, "rewards/margins": 4.84375, "rewards/rejected": -10.0, "step": 16390 }, { "epoch": 0.6510132385923824, "grad_norm": 27.3073797023043, "learning_rate": 1.6366537145763205e-07, "logits/chosen": -2.625, "logits/rejected": -2.71875, "logps/chosen": -688.0, "logps/rejected": -1176.0, "loss": 0.2702, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.03125, "rewards/margins": 5.125, "rewards/rejected": -10.125, "step": 16400 }, { "epoch": 0.651410197884207, "grad_norm": 27.248342012928276, "learning_rate": 1.63340334677114e-07, "logits/chosen": -2.765625, "logits/rejected": -2.859375, "logps/chosen": -664.0, "logps/rejected": -1184.0, "loss": 0.2492, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.875, "rewards/margins": 5.15625, "rewards/rejected": -10.0, "step": 16410 }, { "epoch": 0.6518071571760315, "grad_norm": 32.36845542591727, "learning_rate": 1.6301546430502834e-07, "logits/chosen": -2.765625, "logits/rejected": -2.90625, "logps/chosen": -688.0, "logps/rejected": -1128.0, "loss": 0.2571, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.0625, "rewards/margins": 4.53125, "rewards/rejected": -9.5625, "step": 16420 }, { "epoch": 0.6522041164678563, "grad_norm": 31.2690703786152, "learning_rate": 1.626907609652082e-07, "logits/chosen": -2.765625, "logits/rejected": -2.875, "logps/chosen": -696.0, "logps/rejected": -1184.0, "loss": 0.234, "rewards/accuracies": 0.96875, "rewards/chosen": -5.0625, "rewards/margins": 5.03125, "rewards/rejected": -10.125, "step": 16430 }, { "epoch": 0.6526010757596808, "grad_norm": 29.956349194308945, "learning_rate": 1.6236622528116595e-07, "logits/chosen": -2.8125, "logits/rejected": -2.921875, "logps/chosen": -696.0, "logps/rejected": -1192.0, "loss": 0.262, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -5.21875, "rewards/margins": 4.9375, "rewards/rejected": -10.125, "step": 16440 }, { "epoch": 0.6529980350515054, "grad_norm": 20.05161447600574, "learning_rate": 1.6204185787609195e-07, "logits/chosen": -2.90625, "logits/rejected": -2.96875, "logps/chosen": -672.0, "logps/rejected": -1240.0, "loss": 0.2029, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.09375, "rewards/margins": 5.5, "rewards/rejected": -10.5625, "step": 16450 }, { "epoch": 0.65339499434333, "grad_norm": 21.073935152390636, "learning_rate": 1.6171765937285369e-07, "logits/chosen": -2.875, "logits/rejected": -2.90625, "logps/chosen": -696.0, "logps/rejected": -1160.0, "loss": 0.2361, "rewards/accuracies": 0.9375, "rewards/chosen": -5.25, "rewards/margins": 4.4375, "rewards/rejected": -9.75, "step": 16460 }, { "epoch": 0.6537919536351547, "grad_norm": 29.921238096111065, "learning_rate": 1.6139363039399402e-07, "logits/chosen": -2.796875, "logits/rejected": -3.046875, "logps/chosen": -696.0, "logps/rejected": -1160.0, "loss": 0.2685, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.0625, "rewards/margins": 4.71875, "rewards/rejected": -9.8125, "step": 16470 }, { "epoch": 0.6541889129269793, "grad_norm": 31.9662377937305, "learning_rate": 1.6106977156173035e-07, "logits/chosen": -2.765625, "logits/rejected": -2.96875, "logps/chosen": -724.0, "logps/rejected": -1160.0, "loss": 0.3044, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.375, "rewards/margins": 4.40625, "rewards/rejected": -9.8125, "step": 16480 }, { "epoch": 0.6545858722188039, "grad_norm": 28.42389474362004, "learning_rate": 1.6074608349795337e-07, "logits/chosen": -2.765625, "logits/rejected": -3.109375, "logps/chosen": -668.0, "logps/rejected": -1144.0, "loss": 0.2344, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.8125, "rewards/margins": 4.875, "rewards/rejected": -9.6875, "step": 16490 }, { "epoch": 0.6549828315106286, "grad_norm": 23.15901444434715, "learning_rate": 1.60422566824226e-07, "logits/chosen": -2.796875, "logits/rejected": -3.015625, "logps/chosen": -668.0, "logps/rejected": -1120.0, "loss": 0.2226, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.9375, "rewards/margins": 4.625, "rewards/rejected": -9.5625, "step": 16500 }, { "epoch": 0.6553797908024532, "grad_norm": 20.309576547351956, "learning_rate": 1.6009922216178167e-07, "logits/chosen": -2.734375, "logits/rejected": -2.9375, "logps/chosen": -668.0, "logps/rejected": -1144.0, "loss": 0.241, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.875, "rewards/margins": 4.84375, "rewards/rejected": -9.75, "step": 16510 }, { "epoch": 0.6557767500942778, "grad_norm": 23.960479426436436, "learning_rate": 1.59776050131524e-07, "logits/chosen": -2.828125, "logits/rejected": -3.140625, "logps/chosen": -652.0, "logps/rejected": -1128.0, "loss": 0.2681, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.84375, "rewards/margins": 4.84375, "rewards/rejected": -9.6875, "step": 16520 }, { "epoch": 0.6561737093861024, "grad_norm": 31.3890809253685, "learning_rate": 1.5945305135402477e-07, "logits/chosen": -2.6875, "logits/rejected": -2.921875, "logps/chosen": -716.0, "logps/rejected": -1208.0, "loss": 0.2522, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.34375, "rewards/margins": 4.90625, "rewards/rejected": -10.25, "step": 16530 }, { "epoch": 0.6565706686779271, "grad_norm": 28.900315715918754, "learning_rate": 1.591302264495232e-07, "logits/chosen": -2.765625, "logits/rejected": -2.921875, "logps/chosen": -692.0, "logps/rejected": -1176.0, "loss": 0.2397, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.09375, "rewards/margins": 4.6875, "rewards/rejected": -9.8125, "step": 16540 }, { "epoch": 0.6569676279697517, "grad_norm": 22.149778222330696, "learning_rate": 1.5880757603792452e-07, "logits/chosen": -2.90625, "logits/rejected": -3.0625, "logps/chosen": -692.0, "logps/rejected": -1120.0, "loss": 0.2718, "rewards/accuracies": 0.9375, "rewards/chosen": -5.3125, "rewards/margins": 4.40625, "rewards/rejected": -9.75, "step": 16550 }, { "epoch": 0.6573645872615763, "grad_norm": 22.541836161783344, "learning_rate": 1.5848510073879894e-07, "logits/chosen": -2.703125, "logits/rejected": -3.03125, "logps/chosen": -688.0, "logps/rejected": -1184.0, "loss": 0.2735, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.8125, "rewards/margins": 5.25, "rewards/rejected": -10.0625, "step": 16560 }, { "epoch": 0.6577615465534009, "grad_norm": 19.385295897927133, "learning_rate": 1.5816280117138066e-07, "logits/chosen": -2.75, "logits/rejected": -3.0, "logps/chosen": -680.0, "logps/rejected": -1104.0, "loss": 0.2539, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.0625, "rewards/margins": 4.34375, "rewards/rejected": -9.375, "step": 16570 }, { "epoch": 0.6581585058452256, "grad_norm": 40.91056818307048, "learning_rate": 1.5784067795456593e-07, "logits/chosen": -2.78125, "logits/rejected": -2.890625, "logps/chosen": -672.0, "logps/rejected": -1128.0, "loss": 0.2673, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.9375, "rewards/margins": 4.78125, "rewards/rejected": -9.6875, "step": 16580 }, { "epoch": 0.6585554651370502, "grad_norm": 21.148951360596495, "learning_rate": 1.575187317069128e-07, "logits/chosen": -2.765625, "logits/rejected": -2.890625, "logps/chosen": -696.0, "logps/rejected": -1168.0, "loss": 0.2557, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.15625, "rewards/margins": 4.84375, "rewards/rejected": -10.0, "step": 16590 }, { "epoch": 0.6589524244288748, "grad_norm": 21.28003806014401, "learning_rate": 1.571969630466393e-07, "logits/chosen": -2.78125, "logits/rejected": -3.0, "logps/chosen": -680.0, "logps/rejected": -1184.0, "loss": 0.2658, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.03125, "rewards/margins": 5.15625, "rewards/rejected": -10.1875, "step": 16600 }, { "epoch": 0.6593493837206994, "grad_norm": 26.74566696592372, "learning_rate": 1.5687537259162237e-07, "logits/chosen": -2.84375, "logits/rejected": -2.859375, "logps/chosen": -680.0, "logps/rejected": -1200.0, "loss": 0.2465, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.125, "rewards/margins": 5.09375, "rewards/rejected": -10.25, "step": 16610 }, { "epoch": 0.6597463430125241, "grad_norm": 32.062959049808136, "learning_rate": 1.565539609593971e-07, "logits/chosen": -2.84375, "logits/rejected": -3.03125, "logps/chosen": -656.0, "logps/rejected": -1176.0, "loss": 0.2471, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.9375, "rewards/margins": 5.09375, "rewards/rejected": -10.0, "step": 16620 }, { "epoch": 0.6601433023043487, "grad_norm": 32.32059992583944, "learning_rate": 1.562327287671547e-07, "logits/chosen": -2.765625, "logits/rejected": -2.921875, "logps/chosen": -688.0, "logps/rejected": -1152.0, "loss": 0.2753, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.96875, "rewards/margins": 4.875, "rewards/rejected": -9.8125, "step": 16630 }, { "epoch": 0.6605402615961733, "grad_norm": 34.269009873013154, "learning_rate": 1.5591167663174228e-07, "logits/chosen": -2.828125, "logits/rejected": -2.90625, "logps/chosen": -672.0, "logps/rejected": -1136.0, "loss": 0.2678, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.96875, "rewards/margins": 4.65625, "rewards/rejected": -9.625, "step": 16640 }, { "epoch": 0.6609372208879979, "grad_norm": 28.799238506628985, "learning_rate": 1.5559080516966085e-07, "logits/chosen": -2.75, "logits/rejected": -2.84375, "logps/chosen": -684.0, "logps/rejected": -1200.0, "loss": 0.221, "rewards/accuracies": 0.96875, "rewards/chosen": -4.90625, "rewards/margins": 5.0625, "rewards/rejected": -10.0, "step": 16650 }, { "epoch": 0.6613341801798226, "grad_norm": 25.603048344293864, "learning_rate": 1.5527011499706456e-07, "logits/chosen": -2.90625, "logits/rejected": -3.0625, "logps/chosen": -660.0, "logps/rejected": -1120.0, "loss": 0.2803, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.03125, "rewards/margins": 4.6875, "rewards/rejected": -9.75, "step": 16660 }, { "epoch": 0.6617311394716472, "grad_norm": 24.0873945828608, "learning_rate": 1.5494960672975967e-07, "logits/chosen": -2.984375, "logits/rejected": -3.046875, "logps/chosen": -672.0, "logps/rejected": -1216.0, "loss": 0.264, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.0625, "rewards/margins": 5.25, "rewards/rejected": -10.3125, "step": 16670 }, { "epoch": 0.6621280987634718, "grad_norm": 18.527407877050713, "learning_rate": 1.5462928098320282e-07, "logits/chosen": -2.765625, "logits/rejected": -2.9375, "logps/chosen": -692.0, "logps/rejected": -1144.0, "loss": 0.2411, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.09375, "rewards/margins": 4.75, "rewards/rejected": -9.875, "step": 16680 }, { "epoch": 0.6625250580552964, "grad_norm": 24.497721443426663, "learning_rate": 1.5430913837250043e-07, "logits/chosen": -2.8125, "logits/rejected": -3.03125, "logps/chosen": -664.0, "logps/rejected": -1160.0, "loss": 0.2236, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.90625, "rewards/margins": 5.09375, "rewards/rejected": -10.0, "step": 16690 }, { "epoch": 0.6629220173471211, "grad_norm": 32.36755226581034, "learning_rate": 1.5398917951240695e-07, "logits/chosen": -2.859375, "logits/rejected": -3.09375, "logps/chosen": -644.0, "logps/rejected": -1112.0, "loss": 0.2767, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.875, "rewards/margins": 4.78125, "rewards/rejected": -9.6875, "step": 16700 }, { "epoch": 0.6633189766389457, "grad_norm": 25.066409747119057, "learning_rate": 1.536694050173242e-07, "logits/chosen": -2.890625, "logits/rejected": -3.140625, "logps/chosen": -648.0, "logps/rejected": -1152.0, "loss": 0.2542, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.6875, "rewards/margins": 5.21875, "rewards/rejected": -9.875, "step": 16710 }, { "epoch": 0.6637159359307703, "grad_norm": 36.0794001687815, "learning_rate": 1.5334981550130004e-07, "logits/chosen": -2.71875, "logits/rejected": -2.796875, "logps/chosen": -668.0, "logps/rejected": -1168.0, "loss": 0.2399, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.96875, "rewards/margins": 4.875, "rewards/rejected": -9.8125, "step": 16720 }, { "epoch": 0.6641128952225949, "grad_norm": 31.29039376718535, "learning_rate": 1.5303041157802698e-07, "logits/chosen": -2.71875, "logits/rejected": -2.90625, "logps/chosen": -684.0, "logps/rejected": -1184.0, "loss": 0.2314, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.0, "rewards/margins": 5.0, "rewards/rejected": -10.0, "step": 16730 }, { "epoch": 0.6645098545144196, "grad_norm": 36.58472300114473, "learning_rate": 1.5271119386084108e-07, "logits/chosen": -2.90625, "logits/rejected": -3.0625, "logps/chosen": -680.0, "logps/rejected": -1152.0, "loss": 0.2543, "rewards/accuracies": 0.9375, "rewards/chosen": -5.0, "rewards/margins": 4.75, "rewards/rejected": -9.75, "step": 16740 }, { "epoch": 0.6649068138062442, "grad_norm": 22.04988673311834, "learning_rate": 1.523921629627211e-07, "logits/chosen": -2.796875, "logits/rejected": -2.9375, "logps/chosen": -664.0, "logps/rejected": -1160.0, "loss": 0.2331, "rewards/accuracies": 0.96875, "rewards/chosen": -5.0, "rewards/margins": 4.90625, "rewards/rejected": -9.9375, "step": 16750 }, { "epoch": 0.6653037730980688, "grad_norm": 32.01433072968656, "learning_rate": 1.5207331949628672e-07, "logits/chosen": -2.875, "logits/rejected": -3.0, "logps/chosen": -664.0, "logps/rejected": -1184.0, "loss": 0.2443, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.9375, "rewards/margins": 5.0625, "rewards/rejected": -10.0, "step": 16760 }, { "epoch": 0.6657007323898935, "grad_norm": 15.707989731809237, "learning_rate": 1.517546640737979e-07, "logits/chosen": -2.890625, "logits/rejected": -2.921875, "logps/chosen": -684.0, "logps/rejected": -1168.0, "loss": 0.2765, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.09375, "rewards/margins": 4.8125, "rewards/rejected": -9.875, "step": 16770 }, { "epoch": 0.6660976916817181, "grad_norm": 21.79355208174277, "learning_rate": 1.5143619730715364e-07, "logits/chosen": -2.796875, "logits/rejected": -3.109375, "logps/chosen": -668.0, "logps/rejected": -1144.0, "loss": 0.2639, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.78125, "rewards/margins": 5.0, "rewards/rejected": -9.75, "step": 16780 }, { "epoch": 0.6664946509735427, "grad_norm": 27.497524329263197, "learning_rate": 1.5111791980789038e-07, "logits/chosen": -2.71875, "logits/rejected": -3.234375, "logps/chosen": -640.0, "logps/rejected": -1104.0, "loss": 0.2506, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.75, "rewards/margins": 4.78125, "rewards/rejected": -9.5625, "step": 16790 }, { "epoch": 0.6668916102653673, "grad_norm": 34.42967298267666, "learning_rate": 1.5079983218718134e-07, "logits/chosen": -2.609375, "logits/rejected": -2.609375, "logps/chosen": -712.0, "logps/rejected": -1184.0, "loss": 0.2278, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.3125, "rewards/margins": 4.78125, "rewards/rejected": -10.125, "step": 16800 }, { "epoch": 0.667288569557192, "grad_norm": 30.445039498069622, "learning_rate": 1.5048193505583497e-07, "logits/chosen": -2.6875, "logits/rejected": -2.96875, "logps/chosen": -636.0, "logps/rejected": -1152.0, "loss": 0.2677, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.625, "rewards/margins": 5.15625, "rewards/rejected": -9.75, "step": 16810 }, { "epoch": 0.6676855288490166, "grad_norm": 19.96988645862788, "learning_rate": 1.5016422902429402e-07, "logits/chosen": -2.796875, "logits/rejected": -3.09375, "logps/chosen": -680.0, "logps/rejected": -1104.0, "loss": 0.2713, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.0625, "rewards/margins": 4.5, "rewards/rejected": -9.5625, "step": 16820 }, { "epoch": 0.6680824881408411, "grad_norm": 15.031435734894917, "learning_rate": 1.4984671470263434e-07, "logits/chosen": -2.75, "logits/rejected": -3.046875, "logps/chosen": -688.0, "logps/rejected": -1176.0, "loss": 0.2524, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.0625, "rewards/margins": 4.96875, "rewards/rejected": -10.0, "step": 16830 }, { "epoch": 0.6684794474326657, "grad_norm": 32.304685405754334, "learning_rate": 1.4952939270056354e-07, "logits/chosen": -2.609375, "logits/rejected": -2.71875, "logps/chosen": -680.0, "logps/rejected": -1128.0, "loss": 0.2716, "rewards/accuracies": 0.9375, "rewards/chosen": -4.71875, "rewards/margins": 4.65625, "rewards/rejected": -9.375, "step": 16840 }, { "epoch": 0.6688764067244904, "grad_norm": 26.253575298223147, "learning_rate": 1.4921226362741996e-07, "logits/chosen": -2.6875, "logits/rejected": -2.734375, "logps/chosen": -636.0, "logps/rejected": -1128.0, "loss": 0.2498, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.5, "rewards/margins": 5.15625, "rewards/rejected": -9.6875, "step": 16850 }, { "epoch": 0.669273366016315, "grad_norm": 29.444664277315646, "learning_rate": 1.4889532809217143e-07, "logits/chosen": -2.8125, "logits/rejected": -3.109375, "logps/chosen": -648.0, "logps/rejected": -1104.0, "loss": 0.2272, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.71875, "rewards/margins": 4.59375, "rewards/rejected": -9.3125, "step": 16860 }, { "epoch": 0.6696703253081396, "grad_norm": 21.874335770723697, "learning_rate": 1.4857858670341415e-07, "logits/chosen": -2.640625, "logits/rejected": -2.890625, "logps/chosen": -684.0, "logps/rejected": -1192.0, "loss": 0.231, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.96875, "rewards/margins": 4.9375, "rewards/rejected": -9.9375, "step": 16870 }, { "epoch": 0.6700672845999642, "grad_norm": 30.01481534327102, "learning_rate": 1.4826204006937165e-07, "logits/chosen": -2.921875, "logits/rejected": -3.046875, "logps/chosen": -684.0, "logps/rejected": -1144.0, "loss": 0.2693, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.1875, "rewards/margins": 4.65625, "rewards/rejected": -9.8125, "step": 16880 }, { "epoch": 0.6704642438917889, "grad_norm": 36.27014571222068, "learning_rate": 1.4794568879789336e-07, "logits/chosen": -2.84375, "logits/rejected": -3.234375, "logps/chosen": -680.0, "logps/rejected": -1152.0, "loss": 0.278, "rewards/accuracies": 0.9375, "rewards/chosen": -5.125, "rewards/margins": 4.8125, "rewards/rejected": -9.9375, "step": 16890 }, { "epoch": 0.6708612031836135, "grad_norm": 43.05146046822001, "learning_rate": 1.476295334964535e-07, "logits/chosen": -2.75, "logits/rejected": -3.0625, "logps/chosen": -656.0, "logps/rejected": -1168.0, "loss": 0.2428, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.75, "rewards/margins": 5.15625, "rewards/rejected": -9.875, "step": 16900 }, { "epoch": 0.6712581624754381, "grad_norm": 38.50690368531324, "learning_rate": 1.4731357477215016e-07, "logits/chosen": -2.765625, "logits/rejected": -2.8125, "logps/chosen": -652.0, "logps/rejected": -1152.0, "loss": 0.2827, "rewards/accuracies": 0.9375, "rewards/chosen": -4.78125, "rewards/margins": 5.0, "rewards/rejected": -9.75, "step": 16910 }, { "epoch": 0.6716551217672627, "grad_norm": 23.14908454171815, "learning_rate": 1.4699781323170368e-07, "logits/chosen": -2.671875, "logits/rejected": -3.03125, "logps/chosen": -712.0, "logps/rejected": -1136.0, "loss": 0.2632, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.25, "rewards/margins": 4.46875, "rewards/rejected": -9.6875, "step": 16920 }, { "epoch": 0.6720520810590874, "grad_norm": 20.107325301993626, "learning_rate": 1.466822494814561e-07, "logits/chosen": -2.796875, "logits/rejected": -3.078125, "logps/chosen": -644.0, "logps/rejected": -1112.0, "loss": 0.2194, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.84375, "rewards/margins": 4.65625, "rewards/rejected": -9.5, "step": 16930 }, { "epoch": 0.672449040350912, "grad_norm": 28.035415272018625, "learning_rate": 1.4636688412736938e-07, "logits/chosen": -2.890625, "logits/rejected": -3.109375, "logps/chosen": -664.0, "logps/rejected": -1120.0, "loss": 0.2297, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.03125, "rewards/margins": 4.6875, "rewards/rejected": -9.6875, "step": 16940 }, { "epoch": 0.6728459996427366, "grad_norm": 30.6508296898807, "learning_rate": 1.460517177750246e-07, "logits/chosen": -2.828125, "logits/rejected": -2.90625, "logps/chosen": -660.0, "logps/rejected": -1176.0, "loss": 0.227, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.9375, "rewards/margins": 4.96875, "rewards/rejected": -9.9375, "step": 16950 }, { "epoch": 0.6732429589345612, "grad_norm": 24.61973243733961, "learning_rate": 1.457367510296208e-07, "logits/chosen": -2.859375, "logits/rejected": -3.03125, "logps/chosen": -672.0, "logps/rejected": -1224.0, "loss": 0.2484, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -5.09375, "rewards/margins": 5.3125, "rewards/rejected": -10.4375, "step": 16960 }, { "epoch": 0.6736399182263859, "grad_norm": 20.348614423419377, "learning_rate": 1.4542198449597348e-07, "logits/chosen": -2.90625, "logits/rejected": -3.015625, "logps/chosen": -680.0, "logps/rejected": -1168.0, "loss": 0.2522, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.25, "rewards/margins": 4.71875, "rewards/rejected": -9.9375, "step": 16970 }, { "epoch": 0.6740368775182105, "grad_norm": 29.959265992423624, "learning_rate": 1.4510741877851403e-07, "logits/chosen": -2.75, "logits/rejected": -2.890625, "logps/chosen": -700.0, "logps/rejected": -1200.0, "loss": 0.2514, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.1875, "rewards/margins": 4.8125, "rewards/rejected": -10.0, "step": 16980 }, { "epoch": 0.6744338368100351, "grad_norm": 30.775889932694838, "learning_rate": 1.447930544812878e-07, "logits/chosen": -2.84375, "logits/rejected": -3.03125, "logps/chosen": -648.0, "logps/rejected": -1168.0, "loss": 0.2365, "rewards/accuracies": 0.96875, "rewards/chosen": -4.84375, "rewards/margins": 5.0625, "rewards/rejected": -9.9375, "step": 16990 }, { "epoch": 0.6748307961018598, "grad_norm": 29.30966777823894, "learning_rate": 1.444788922079539e-07, "logits/chosen": -2.6875, "logits/rejected": -2.84375, "logps/chosen": -692.0, "logps/rejected": -1152.0, "loss": 0.2562, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.0, "rewards/margins": 4.75, "rewards/rejected": -9.75, "step": 17000 }, { "epoch": 0.6748307961018598, "eval_logits/chosen": -2.75, "eval_logits/rejected": -2.953125, "eval_logps/chosen": -688.0, "eval_logps/rejected": -1064.0, "eval_loss": 0.24750015139579773, "eval_rewards/accuracies": 0.8939764499664307, "eval_rewards/chosen": -5.03125, "eval_rewards/margins": 3.953125, "eval_rewards/rejected": -9.0, "eval_runtime": 5416.3606, "eval_samples_per_second": 32.612, "eval_steps_per_second": 0.51, "step": 17000 }, { "epoch": 0.6752277553936844, "grad_norm": 25.267533600596813, "learning_rate": 1.4416493256178297e-07, "logits/chosen": -2.734375, "logits/rejected": -2.90625, "logps/chosen": -676.0, "logps/rejected": -1120.0, "loss": 0.2588, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.9375, "rewards/margins": 4.46875, "rewards/rejected": -9.4375, "step": 17010 }, { "epoch": 0.675624714685509, "grad_norm": 25.353184532498364, "learning_rate": 1.4385117614565683e-07, "logits/chosen": -2.6875, "logits/rejected": -2.828125, "logps/chosen": -668.0, "logps/rejected": -1160.0, "loss": 0.2147, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.8125, "rewards/margins": 4.9375, "rewards/rejected": -9.75, "step": 17020 }, { "epoch": 0.6760216739773336, "grad_norm": 26.63930702499129, "learning_rate": 1.4353762356206696e-07, "logits/chosen": -2.765625, "logits/rejected": -2.859375, "logps/chosen": -684.0, "logps/rejected": -1200.0, "loss": 0.2725, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.0625, "rewards/margins": 5.125, "rewards/rejected": -10.1875, "step": 17030 }, { "epoch": 0.6764186332691583, "grad_norm": 25.009837249247617, "learning_rate": 1.4322427541311346e-07, "logits/chosen": -2.765625, "logits/rejected": -2.765625, "logps/chosen": -640.0, "logps/rejected": -1152.0, "loss": 0.2355, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.6875, "rewards/margins": 4.96875, "rewards/rejected": -9.625, "step": 17040 }, { "epoch": 0.6768155925609829, "grad_norm": 27.02884775496028, "learning_rate": 1.4291113230050405e-07, "logits/chosen": -2.75, "logits/rejected": -3.0, "logps/chosen": -672.0, "logps/rejected": -1168.0, "loss": 0.2893, "rewards/accuracies": 0.96875, "rewards/chosen": -5.0, "rewards/margins": 5.0625, "rewards/rejected": -10.0625, "step": 17050 }, { "epoch": 0.6772125518528075, "grad_norm": 28.43304806662623, "learning_rate": 1.425981948255524e-07, "logits/chosen": -2.703125, "logits/rejected": -2.796875, "logps/chosen": -636.0, "logps/rejected": -1176.0, "loss": 0.2027, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.75, "rewards/margins": 5.125, "rewards/rejected": -9.875, "step": 17060 }, { "epoch": 0.6776095111446321, "grad_norm": 26.754485210623674, "learning_rate": 1.4228546358917742e-07, "logits/chosen": -2.6875, "logits/rejected": -2.859375, "logps/chosen": -692.0, "logps/rejected": -1168.0, "loss": 0.2361, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.03125, "rewards/margins": 5.0625, "rewards/rejected": -10.0625, "step": 17070 }, { "epoch": 0.6780064704364568, "grad_norm": 28.0753373640099, "learning_rate": 1.4197293919190205e-07, "logits/chosen": -2.8125, "logits/rejected": -2.796875, "logps/chosen": -664.0, "logps/rejected": -1176.0, "loss": 0.2514, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.03125, "rewards/margins": 4.96875, "rewards/rejected": -10.0, "step": 17080 }, { "epoch": 0.6784034297282814, "grad_norm": 28.355729714661752, "learning_rate": 1.416606222338522e-07, "logits/chosen": -2.75, "logits/rejected": -2.84375, "logps/chosen": -672.0, "logps/rejected": -1152.0, "loss": 0.2629, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.0, "rewards/margins": 4.78125, "rewards/rejected": -9.75, "step": 17090 }, { "epoch": 0.678800389020106, "grad_norm": 32.78788324076574, "learning_rate": 1.4134851331475512e-07, "logits/chosen": -2.890625, "logits/rejected": -2.96875, "logps/chosen": -660.0, "logps/rejected": -1176.0, "loss": 0.2356, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.78125, "rewards/margins": 5.34375, "rewards/rejected": -10.125, "step": 17100 }, { "epoch": 0.6791973483119306, "grad_norm": 27.396500118570856, "learning_rate": 1.4103661303393872e-07, "logits/chosen": -2.78125, "logits/rejected": -2.9375, "logps/chosen": -644.0, "logps/rejected": -1200.0, "loss": 0.2394, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.65625, "rewards/margins": 5.46875, "rewards/rejected": -10.125, "step": 17110 }, { "epoch": 0.6795943076037553, "grad_norm": 29.728175487686165, "learning_rate": 1.4072492199033016e-07, "logits/chosen": -2.71875, "logits/rejected": -2.890625, "logps/chosen": -640.0, "logps/rejected": -1184.0, "loss": 0.2305, "rewards/accuracies": 0.96875, "rewards/chosen": -4.6875, "rewards/margins": 5.46875, "rewards/rejected": -10.1875, "step": 17120 }, { "epoch": 0.6799912668955799, "grad_norm": 29.22634851526502, "learning_rate": 1.4041344078245503e-07, "logits/chosen": -2.90625, "logits/rejected": -3.0, "logps/chosen": -656.0, "logps/rejected": -1160.0, "loss": 0.2145, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.875, "rewards/margins": 4.875, "rewards/rejected": -9.75, "step": 17130 }, { "epoch": 0.6803882261874045, "grad_norm": 29.48245856037284, "learning_rate": 1.40102170008436e-07, "logits/chosen": -2.796875, "logits/rejected": -2.953125, "logps/chosen": -644.0, "logps/rejected": -1152.0, "loss": 0.2368, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.75, "rewards/margins": 5.0625, "rewards/rejected": -9.8125, "step": 17140 }, { "epoch": 0.6807851854792291, "grad_norm": 36.131800622446626, "learning_rate": 1.397911102659914e-07, "logits/chosen": -2.8125, "logits/rejected": -3.140625, "logps/chosen": -696.0, "logps/rejected": -1168.0, "loss": 0.2736, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.09375, "rewards/margins": 4.9375, "rewards/rejected": -10.0, "step": 17150 }, { "epoch": 0.6811821447710538, "grad_norm": 24.770278511049987, "learning_rate": 1.394802621524344e-07, "logits/chosen": -2.734375, "logits/rejected": -2.96875, "logps/chosen": -640.0, "logps/rejected": -1184.0, "loss": 0.2693, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.5625, "rewards/margins": 5.46875, "rewards/rejected": -10.0, "step": 17160 }, { "epoch": 0.6815791040628784, "grad_norm": 23.330537321264536, "learning_rate": 1.3916962626467187e-07, "logits/chosen": -2.75, "logits/rejected": -3.0, "logps/chosen": -680.0, "logps/rejected": -1152.0, "loss": 0.2841, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.15625, "rewards/margins": 4.625, "rewards/rejected": -9.8125, "step": 17170 }, { "epoch": 0.681976063354703, "grad_norm": 29.584422375760948, "learning_rate": 1.3885920319920318e-07, "logits/chosen": -2.765625, "logits/rejected": -2.78125, "logps/chosen": -640.0, "logps/rejected": -1128.0, "loss": 0.2338, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.75, "rewards/margins": 4.71875, "rewards/rejected": -9.5, "step": 17180 }, { "epoch": 0.6823730226465275, "grad_norm": 25.670335088943315, "learning_rate": 1.3854899355211903e-07, "logits/chosen": -2.765625, "logits/rejected": -2.828125, "logps/chosen": -676.0, "logps/rejected": -1168.0, "loss": 0.2321, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.96875, "rewards/margins": 4.84375, "rewards/rejected": -9.8125, "step": 17190 }, { "epoch": 0.6827699819383523, "grad_norm": 23.81103608622012, "learning_rate": 1.382389979191002e-07, "logits/chosen": -2.953125, "logits/rejected": -3.46875, "logps/chosen": -660.0, "logps/rejected": -1104.0, "loss": 0.2054, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.96875, "rewards/margins": 4.65625, "rewards/rejected": -9.625, "step": 17200 }, { "epoch": 0.6831669412301768, "grad_norm": 28.630013526491656, "learning_rate": 1.3792921689541648e-07, "logits/chosen": -3.015625, "logits/rejected": -3.140625, "logps/chosen": -652.0, "logps/rejected": -1120.0, "loss": 0.2575, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.9375, "rewards/margins": 4.71875, "rewards/rejected": -9.6875, "step": 17210 }, { "epoch": 0.6835639005220014, "grad_norm": 29.838647772889903, "learning_rate": 1.3761965107592561e-07, "logits/chosen": -2.859375, "logits/rejected": -3.125, "logps/chosen": -704.0, "logps/rejected": -1160.0, "loss": 0.214, "rewards/accuracies": 0.9375, "rewards/chosen": -5.25, "rewards/margins": 4.5625, "rewards/rejected": -9.8125, "step": 17220 }, { "epoch": 0.683960859813826, "grad_norm": 33.09328353073959, "learning_rate": 1.3731030105507219e-07, "logits/chosen": -2.765625, "logits/rejected": -2.90625, "logps/chosen": -672.0, "logps/rejected": -1144.0, "loss": 0.2286, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.9375, "rewards/margins": 4.8125, "rewards/rejected": -9.75, "step": 17230 }, { "epoch": 0.6843578191056507, "grad_norm": 26.507839102705237, "learning_rate": 1.370011674268864e-07, "logits/chosen": -2.6875, "logits/rejected": -2.984375, "logps/chosen": -676.0, "logps/rejected": -1152.0, "loss": 0.2492, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.09375, "rewards/margins": 4.8125, "rewards/rejected": -9.875, "step": 17240 }, { "epoch": 0.6847547783974753, "grad_norm": 34.21154057049875, "learning_rate": 1.3669225078498278e-07, "logits/chosen": -2.71875, "logits/rejected": -2.84375, "logps/chosen": -656.0, "logps/rejected": -1184.0, "loss": 0.2705, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.875, "rewards/margins": 5.125, "rewards/rejected": -10.0, "step": 17250 }, { "epoch": 0.6851517376892999, "grad_norm": 23.363868357374557, "learning_rate": 1.3638355172255918e-07, "logits/chosen": -2.625, "logits/rejected": -2.671875, "logps/chosen": -660.0, "logps/rejected": -1232.0, "loss": 0.2063, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.8125, "rewards/margins": 5.46875, "rewards/rejected": -10.3125, "step": 17260 }, { "epoch": 0.6855486969811246, "grad_norm": 30.068610012178617, "learning_rate": 1.3607507083239587e-07, "logits/chosen": -2.765625, "logits/rejected": -3.078125, "logps/chosen": -680.0, "logps/rejected": -1176.0, "loss": 0.2419, "rewards/accuracies": 0.96875, "rewards/chosen": -5.09375, "rewards/margins": 4.9375, "rewards/rejected": -10.0, "step": 17270 }, { "epoch": 0.6859456562729492, "grad_norm": 40.41707239550191, "learning_rate": 1.3576680870685386e-07, "logits/chosen": -2.734375, "logits/rejected": -2.78125, "logps/chosen": -668.0, "logps/rejected": -1184.0, "loss": 0.2502, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.84375, "rewards/margins": 5.15625, "rewards/rejected": -10.0, "step": 17280 }, { "epoch": 0.6863426155647738, "grad_norm": 29.666156558511826, "learning_rate": 1.3545876593787442e-07, "logits/chosen": -2.859375, "logits/rejected": -2.96875, "logps/chosen": -672.0, "logps/rejected": -1176.0, "loss": 0.2271, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.03125, "rewards/margins": 4.875, "rewards/rejected": -9.875, "step": 17290 }, { "epoch": 0.6867395748565984, "grad_norm": 25.08197951932217, "learning_rate": 1.3515094311697736e-07, "logits/chosen": -2.734375, "logits/rejected": -2.953125, "logps/chosen": -640.0, "logps/rejected": -1128.0, "loss": 0.198, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.71875, "rewards/margins": 4.8125, "rewards/rejected": -9.5, "step": 17300 }, { "epoch": 0.6871365341484231, "grad_norm": 15.613547002655448, "learning_rate": 1.3484334083526006e-07, "logits/chosen": -2.6875, "logits/rejected": -2.921875, "logps/chosen": -620.0, "logps/rejected": -1152.0, "loss": 0.2342, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.5625, "rewards/margins": 5.1875, "rewards/rejected": -9.75, "step": 17310 }, { "epoch": 0.6875334934402477, "grad_norm": 20.728139523703145, "learning_rate": 1.345359596833968e-07, "logits/chosen": -2.8125, "logits/rejected": -3.078125, "logps/chosen": -680.0, "logps/rejected": -1176.0, "loss": 0.2116, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.9375, "rewards/margins": 5.25, "rewards/rejected": -10.1875, "step": 17320 }, { "epoch": 0.6879304527320723, "grad_norm": 30.088348378273015, "learning_rate": 1.342288002516368e-07, "logits/chosen": -2.796875, "logits/rejected": -3.046875, "logps/chosen": -644.0, "logps/rejected": -1136.0, "loss": 0.2384, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.75, "rewards/margins": 5.0, "rewards/rejected": -9.75, "step": 17330 }, { "epoch": 0.6883274120238969, "grad_norm": 30.141798139753956, "learning_rate": 1.339218631298037e-07, "logits/chosen": -2.875, "logits/rejected": -2.90625, "logps/chosen": -652.0, "logps/rejected": -1152.0, "loss": 0.2365, "rewards/accuracies": 0.9375, "rewards/chosen": -4.90625, "rewards/margins": 4.90625, "rewards/rejected": -9.8125, "step": 17340 }, { "epoch": 0.6887243713157216, "grad_norm": 32.465618462613, "learning_rate": 1.3361514890729437e-07, "logits/chosen": -2.671875, "logits/rejected": -2.9375, "logps/chosen": -636.0, "logps/rejected": -1152.0, "loss": 0.2164, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.5, "rewards/margins": 5.3125, "rewards/rejected": -9.8125, "step": 17350 }, { "epoch": 0.6891213306075462, "grad_norm": 37.85402401868134, "learning_rate": 1.3330865817307741e-07, "logits/chosen": -2.765625, "logits/rejected": -2.90625, "logps/chosen": -652.0, "logps/rejected": -1216.0, "loss": 0.2264, "rewards/accuracies": 0.96875, "rewards/chosen": -4.71875, "rewards/margins": 5.625, "rewards/rejected": -10.375, "step": 17360 }, { "epoch": 0.6895182898993708, "grad_norm": 38.43511652355203, "learning_rate": 1.330023915156925e-07, "logits/chosen": -2.703125, "logits/rejected": -2.90625, "logps/chosen": -672.0, "logps/rejected": -1120.0, "loss": 0.2645, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.875, "rewards/margins": 4.53125, "rewards/rejected": -9.4375, "step": 17370 }, { "epoch": 0.6899152491911954, "grad_norm": 19.99335764267303, "learning_rate": 1.3269634952324892e-07, "logits/chosen": -2.71875, "logits/rejected": -2.8125, "logps/chosen": -640.0, "logps/rejected": -1192.0, "loss": 0.2396, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.5625, "rewards/margins": 5.46875, "rewards/rejected": -10.0625, "step": 17380 }, { "epoch": 0.6903122084830201, "grad_norm": 22.064569790717385, "learning_rate": 1.3239053278342443e-07, "logits/chosen": -2.765625, "logits/rejected": -3.046875, "logps/chosen": -728.0, "logps/rejected": -1160.0, "loss": 0.2404, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.28125, "rewards/margins": 4.65625, "rewards/rejected": -9.9375, "step": 17390 }, { "epoch": 0.6907091677748447, "grad_norm": 23.258113971402096, "learning_rate": 1.3208494188346442e-07, "logits/chosen": -2.8125, "logits/rejected": -2.921875, "logps/chosen": -708.0, "logps/rejected": -1208.0, "loss": 0.2011, "rewards/accuracies": 0.96875, "rewards/chosen": -5.21875, "rewards/margins": 5.0, "rewards/rejected": -10.1875, "step": 17400 }, { "epoch": 0.6911061270666693, "grad_norm": 28.152336079381886, "learning_rate": 1.317795774101807e-07, "logits/chosen": -2.796875, "logits/rejected": -3.265625, "logps/chosen": -656.0, "logps/rejected": -1176.0, "loss": 0.234, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.8125, "rewards/margins": 5.34375, "rewards/rejected": -10.125, "step": 17410 }, { "epoch": 0.6915030863584939, "grad_norm": 31.378975075117445, "learning_rate": 1.3147443994995005e-07, "logits/chosen": -3.03125, "logits/rejected": -3.453125, "logps/chosen": -664.0, "logps/rejected": -1168.0, "loss": 0.2148, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.9375, "rewards/margins": 5.21875, "rewards/rejected": -10.125, "step": 17420 }, { "epoch": 0.6919000456503186, "grad_norm": 14.983177855653006, "learning_rate": 1.311695300887134e-07, "logits/chosen": -2.890625, "logits/rejected": -3.140625, "logps/chosen": -652.0, "logps/rejected": -1192.0, "loss": 0.218, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.84375, "rewards/margins": 5.46875, "rewards/rejected": -10.3125, "step": 17430 }, { "epoch": 0.6922970049421432, "grad_norm": 25.903741438070455, "learning_rate": 1.3086484841197452e-07, "logits/chosen": -2.90625, "logits/rejected": -3.1875, "logps/chosen": -664.0, "logps/rejected": -1128.0, "loss": 0.2455, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.03125, "rewards/margins": 4.78125, "rewards/rejected": -9.8125, "step": 17440 }, { "epoch": 0.6926939642339678, "grad_norm": 23.768667047720193, "learning_rate": 1.3056039550479927e-07, "logits/chosen": -2.828125, "logits/rejected": -2.9375, "logps/chosen": -692.0, "logps/rejected": -1200.0, "loss": 0.2472, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.09375, "rewards/margins": 5.09375, "rewards/rejected": -10.1875, "step": 17450 }, { "epoch": 0.6930909235257924, "grad_norm": 25.099963218368416, "learning_rate": 1.302561719518142e-07, "logits/chosen": -2.84375, "logits/rejected": -3.0, "logps/chosen": -660.0, "logps/rejected": -1168.0, "loss": 0.2276, "rewards/accuracies": 0.96875, "rewards/chosen": -4.875, "rewards/margins": 4.96875, "rewards/rejected": -9.875, "step": 17460 }, { "epoch": 0.6934878828176171, "grad_norm": 27.650206015052376, "learning_rate": 1.2995217833720517e-07, "logits/chosen": -2.71875, "logits/rejected": -2.890625, "logps/chosen": -676.0, "logps/rejected": -1168.0, "loss": 0.2447, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.03125, "rewards/margins": 4.875, "rewards/rejected": -9.9375, "step": 17470 }, { "epoch": 0.6938848421094417, "grad_norm": 19.9031322860104, "learning_rate": 1.296484152447167e-07, "logits/chosen": -2.75, "logits/rejected": -2.890625, "logps/chosen": -660.0, "logps/rejected": -1136.0, "loss": 0.2087, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.875, "rewards/margins": 4.84375, "rewards/rejected": -9.75, "step": 17480 }, { "epoch": 0.6942818014012663, "grad_norm": 25.781482642704848, "learning_rate": 1.2934488325765053e-07, "logits/chosen": -2.6875, "logits/rejected": -2.890625, "logps/chosen": -652.0, "logps/rejected": -1152.0, "loss": 0.2232, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.65625, "rewards/margins": 5.1875, "rewards/rejected": -9.875, "step": 17490 }, { "epoch": 0.6946787606930909, "grad_norm": 32.817176706072146, "learning_rate": 1.2904158295886469e-07, "logits/chosen": -2.734375, "logits/rejected": -2.9375, "logps/chosen": -668.0, "logps/rejected": -1120.0, "loss": 0.2243, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.875, "rewards/margins": 4.59375, "rewards/rejected": -9.5, "step": 17500 }, { "epoch": 0.6950757199849156, "grad_norm": 29.14315267097393, "learning_rate": 1.2873851493077254e-07, "logits/chosen": -2.671875, "logits/rejected": -2.984375, "logps/chosen": -684.0, "logps/rejected": -1192.0, "loss": 0.2155, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.96875, "rewards/margins": 5.21875, "rewards/rejected": -10.1875, "step": 17510 }, { "epoch": 0.6954726792767402, "grad_norm": 39.242015089601274, "learning_rate": 1.28435679755341e-07, "logits/chosen": -2.875, "logits/rejected": -3.140625, "logps/chosen": -688.0, "logps/rejected": -1184.0, "loss": 0.218, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.21875, "rewards/margins": 4.96875, "rewards/rejected": -10.1875, "step": 17520 }, { "epoch": 0.6958696385685648, "grad_norm": 39.5740664925725, "learning_rate": 1.2813307801409012e-07, "logits/chosen": -2.609375, "logits/rejected": -2.75, "logps/chosen": -672.0, "logps/rejected": -1144.0, "loss": 0.2439, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.90625, "rewards/margins": 4.75, "rewards/rejected": -9.625, "step": 17530 }, { "epoch": 0.6962665978603895, "grad_norm": 25.304822304829504, "learning_rate": 1.2783071028809145e-07, "logits/chosen": -2.578125, "logits/rejected": -2.4375, "logps/chosen": -648.0, "logps/rejected": -1208.0, "loss": 0.2382, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.84375, "rewards/margins": 5.375, "rewards/rejected": -10.25, "step": 17540 }, { "epoch": 0.6966635571522141, "grad_norm": 17.235260885953643, "learning_rate": 1.2752857715796755e-07, "logits/chosen": -2.703125, "logits/rejected": -2.859375, "logps/chosen": -668.0, "logps/rejected": -1176.0, "loss": 0.2381, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.90625, "rewards/margins": 5.25, "rewards/rejected": -10.1875, "step": 17550 }, { "epoch": 0.6970605164440387, "grad_norm": 28.32197574307798, "learning_rate": 1.2722667920389029e-07, "logits/chosen": -2.75, "logits/rejected": -2.796875, "logps/chosen": -700.0, "logps/rejected": -1160.0, "loss": 0.2329, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.09375, "rewards/margins": 4.71875, "rewards/rejected": -9.8125, "step": 17560 }, { "epoch": 0.6974574757358633, "grad_norm": 26.055961941734836, "learning_rate": 1.269250170055799e-07, "logits/chosen": -2.796875, "logits/rejected": -2.96875, "logps/chosen": -680.0, "logps/rejected": -1168.0, "loss": 0.1875, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.875, "rewards/margins": 5.03125, "rewards/rejected": -9.875, "step": 17570 }, { "epoch": 0.697854435027688, "grad_norm": 23.98700689721203, "learning_rate": 1.2662359114230398e-07, "logits/chosen": -2.75, "logits/rejected": -2.9375, "logps/chosen": -664.0, "logps/rejected": -1136.0, "loss": 0.2309, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.875, "rewards/margins": 4.6875, "rewards/rejected": -9.5625, "step": 17580 }, { "epoch": 0.6982513943195126, "grad_norm": 35.558019200474035, "learning_rate": 1.2632240219287609e-07, "logits/chosen": -2.734375, "logits/rejected": -2.875, "logps/chosen": -712.0, "logps/rejected": -1168.0, "loss": 0.2439, "rewards/accuracies": 0.9375, "rewards/chosen": -5.1875, "rewards/margins": 4.875, "rewards/rejected": -10.0625, "step": 17590 }, { "epoch": 0.6986483536113371, "grad_norm": 26.55453818084781, "learning_rate": 1.2602145073565524e-07, "logits/chosen": -2.96875, "logits/rejected": -3.265625, "logps/chosen": -672.0, "logps/rejected": -1128.0, "loss": 0.2504, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.1875, "rewards/margins": 4.65625, "rewards/rejected": -9.875, "step": 17600 }, { "epoch": 0.6990453129031617, "grad_norm": 34.35685935255376, "learning_rate": 1.2572073734854423e-07, "logits/chosen": -2.828125, "logits/rejected": -3.078125, "logps/chosen": -712.0, "logps/rejected": -1152.0, "loss": 0.2552, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.3125, "rewards/margins": 4.625, "rewards/rejected": -9.9375, "step": 17610 }, { "epoch": 0.6994422721949864, "grad_norm": 39.27005639779898, "learning_rate": 1.2542026260898852e-07, "logits/chosen": -2.828125, "logits/rejected": -3.15625, "logps/chosen": -692.0, "logps/rejected": -1144.0, "loss": 0.2637, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.25, "rewards/margins": 4.71875, "rewards/rejected": -9.9375, "step": 17620 }, { "epoch": 0.699839231486811, "grad_norm": 15.554348265274747, "learning_rate": 1.2512002709397552e-07, "logits/chosen": -2.8125, "logits/rejected": -3.171875, "logps/chosen": -676.0, "logps/rejected": -1152.0, "loss": 0.2025, "rewards/accuracies": 0.96875, "rewards/chosen": -5.0625, "rewards/margins": 5.0625, "rewards/rejected": -10.125, "step": 17630 }, { "epoch": 0.7002361907786356, "grad_norm": 21.401865566521593, "learning_rate": 1.2482003138003325e-07, "logits/chosen": -2.78125, "logits/rejected": -2.96875, "logps/chosen": -720.0, "logps/rejected": -1160.0, "loss": 0.2177, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.03125, "rewards/margins": 4.6875, "rewards/rejected": -9.75, "step": 17640 }, { "epoch": 0.7006331500704602, "grad_norm": 28.75792552671402, "learning_rate": 1.2452027604322907e-07, "logits/chosen": -2.84375, "logits/rejected": -3.09375, "logps/chosen": -668.0, "logps/rejected": -1152.0, "loss": 0.2424, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.875, "rewards/margins": 4.96875, "rewards/rejected": -9.875, "step": 17650 }, { "epoch": 0.7010301093622849, "grad_norm": 29.813372113962654, "learning_rate": 1.2422076165916908e-07, "logits/chosen": -2.890625, "logits/rejected": -3.0, "logps/chosen": -644.0, "logps/rejected": -1168.0, "loss": 0.2426, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.90625, "rewards/margins": 5.15625, "rewards/rejected": -10.0625, "step": 17660 }, { "epoch": 0.7014270686541095, "grad_norm": 39.25526757315878, "learning_rate": 1.239214888029964e-07, "logits/chosen": -2.75, "logits/rejected": -2.984375, "logps/chosen": -672.0, "logps/rejected": -1152.0, "loss": 0.2452, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.03125, "rewards/margins": 4.75, "rewards/rejected": -9.75, "step": 17670 }, { "epoch": 0.7018240279459341, "grad_norm": 34.23404269129969, "learning_rate": 1.236224580493904e-07, "logits/chosen": -2.828125, "logits/rejected": -3.0, "logps/chosen": -692.0, "logps/rejected": -1120.0, "loss": 0.2527, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.1875, "rewards/margins": 4.3125, "rewards/rejected": -9.5, "step": 17680 }, { "epoch": 0.7022209872377587, "grad_norm": 22.782373152413072, "learning_rate": 1.2332366997256576e-07, "logits/chosen": -2.71875, "logits/rejected": -3.03125, "logps/chosen": -660.0, "logps/rejected": -1144.0, "loss": 0.2151, "rewards/accuracies": 0.9375, "rewards/chosen": -4.75, "rewards/margins": 4.96875, "rewards/rejected": -9.75, "step": 17690 }, { "epoch": 0.7026179465295834, "grad_norm": 34.844006395503776, "learning_rate": 1.2302512514627082e-07, "logits/chosen": -2.75, "logits/rejected": -2.828125, "logps/chosen": -656.0, "logps/rejected": -1144.0, "loss": 0.2622, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.875, "rewards/margins": 4.875, "rewards/rejected": -9.75, "step": 17700 }, { "epoch": 0.703014905821408, "grad_norm": 33.35331652482632, "learning_rate": 1.227268241437872e-07, "logits/chosen": -2.765625, "logits/rejected": -2.921875, "logps/chosen": -684.0, "logps/rejected": -1176.0, "loss": 0.2161, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.0, "rewards/margins": 5.0, "rewards/rejected": -10.0, "step": 17710 }, { "epoch": 0.7034118651132326, "grad_norm": 28.883681204855687, "learning_rate": 1.2242876753792804e-07, "logits/chosen": -2.640625, "logits/rejected": -2.859375, "logps/chosen": -644.0, "logps/rejected": -1192.0, "loss": 0.2147, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.75, "rewards/margins": 5.5, "rewards/rejected": -10.25, "step": 17720 }, { "epoch": 0.7038088244050572, "grad_norm": 28.69020788146865, "learning_rate": 1.2213095590103717e-07, "logits/chosen": -2.734375, "logits/rejected": -3.125, "logps/chosen": -684.0, "logps/rejected": -1168.0, "loss": 0.2145, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.03125, "rewards/margins": 5.0, "rewards/rejected": -10.0, "step": 17730 }, { "epoch": 0.7042057836968819, "grad_norm": 23.974110222699846, "learning_rate": 1.2183338980498828e-07, "logits/chosen": -2.71875, "logits/rejected": -2.96875, "logps/chosen": -628.0, "logps/rejected": -1144.0, "loss": 0.191, "rewards/accuracies": 0.96875, "rewards/chosen": -4.5, "rewards/margins": 5.3125, "rewards/rejected": -9.8125, "step": 17740 }, { "epoch": 0.7046027429887065, "grad_norm": 22.23329681875104, "learning_rate": 1.215360698211832e-07, "logits/chosen": -2.640625, "logits/rejected": -2.984375, "logps/chosen": -660.0, "logps/rejected": -1160.0, "loss": 0.2341, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.875, "rewards/margins": 5.0, "rewards/rejected": -9.875, "step": 17750 }, { "epoch": 0.7049997022805311, "grad_norm": 26.79122594542984, "learning_rate": 1.2123899652055154e-07, "logits/chosen": -2.796875, "logits/rejected": -2.921875, "logps/chosen": -700.0, "logps/rejected": -1136.0, "loss": 0.253, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.1875, "rewards/margins": 4.65625, "rewards/rejected": -9.8125, "step": 17760 }, { "epoch": 0.7053966615723558, "grad_norm": 31.520019407693905, "learning_rate": 1.2094217047354884e-07, "logits/chosen": -2.765625, "logits/rejected": -2.75, "logps/chosen": -644.0, "logps/rejected": -1112.0, "loss": 0.2286, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.84375, "rewards/margins": 4.625, "rewards/rejected": -9.5, "step": 17770 }, { "epoch": 0.7057936208641804, "grad_norm": 20.06426640897768, "learning_rate": 1.206455922501562e-07, "logits/chosen": -2.65625, "logits/rejected": -2.859375, "logps/chosen": -668.0, "logps/rejected": -1200.0, "loss": 0.237, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.75, "rewards/margins": 5.3125, "rewards/rejected": -10.0625, "step": 17780 }, { "epoch": 0.706190580156005, "grad_norm": 44.76055935167152, "learning_rate": 1.2034926241987864e-07, "logits/chosen": -2.65625, "logits/rejected": -2.9375, "logps/chosen": -672.0, "logps/rejected": -1200.0, "loss": 0.2802, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.8125, "rewards/margins": 5.15625, "rewards/rejected": -10.0, "step": 17790 }, { "epoch": 0.7065875394478296, "grad_norm": 42.842855987026994, "learning_rate": 1.200531815517441e-07, "logits/chosen": -2.71875, "logits/rejected": -3.125, "logps/chosen": -672.0, "logps/rejected": -1184.0, "loss": 0.2231, "rewards/accuracies": 0.9375, "rewards/chosen": -5.0625, "rewards/margins": 5.09375, "rewards/rejected": -10.125, "step": 17800 }, { "epoch": 0.7069844987396543, "grad_norm": 39.36712400485948, "learning_rate": 1.1975735021430278e-07, "logits/chosen": -2.75, "logits/rejected": -3.109375, "logps/chosen": -676.0, "logps/rejected": -1112.0, "loss": 0.2508, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.09375, "rewards/margins": 4.5, "rewards/rejected": -9.625, "step": 17810 }, { "epoch": 0.7073814580314789, "grad_norm": 26.583316770420424, "learning_rate": 1.194617689756253e-07, "logits/chosen": -2.84375, "logits/rejected": -2.921875, "logps/chosen": -680.0, "logps/rejected": -1176.0, "loss": 0.2163, "rewards/accuracies": 0.9375, "rewards/chosen": -5.0625, "rewards/margins": 5.0625, "rewards/rejected": -10.125, "step": 17820 }, { "epoch": 0.7077784173233035, "grad_norm": 27.87964105259203, "learning_rate": 1.1916643840330248e-07, "logits/chosen": -2.734375, "logits/rejected": -2.78125, "logps/chosen": -692.0, "logps/rejected": -1144.0, "loss": 0.2503, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.15625, "rewards/margins": 4.65625, "rewards/rejected": -9.8125, "step": 17830 }, { "epoch": 0.7081753766151281, "grad_norm": 23.447918298742916, "learning_rate": 1.1887135906444348e-07, "logits/chosen": -2.703125, "logits/rejected": -3.03125, "logps/chosen": -676.0, "logps/rejected": -1160.0, "loss": 0.2583, "rewards/accuracies": 0.9375, "rewards/chosen": -5.03125, "rewards/margins": 4.84375, "rewards/rejected": -9.875, "step": 17840 }, { "epoch": 0.7085723359069528, "grad_norm": 24.82293413023745, "learning_rate": 1.1857653152567506e-07, "logits/chosen": -2.859375, "logits/rejected": -3.0, "logps/chosen": -688.0, "logps/rejected": -1176.0, "loss": 0.254, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.09375, "rewards/margins": 4.78125, "rewards/rejected": -9.875, "step": 17850 }, { "epoch": 0.7089692951987774, "grad_norm": 21.276119586745956, "learning_rate": 1.182819563531405e-07, "logits/chosen": -2.78125, "logits/rejected": -3.046875, "logps/chosen": -664.0, "logps/rejected": -1144.0, "loss": 0.257, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.0, "rewards/margins": 4.8125, "rewards/rejected": -9.8125, "step": 17860 }, { "epoch": 0.709366254490602, "grad_norm": 14.441022315669862, "learning_rate": 1.1798763411249852e-07, "logits/chosen": -2.828125, "logits/rejected": -3.0, "logps/chosen": -660.0, "logps/rejected": -1136.0, "loss": 0.2286, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.96875, "rewards/margins": 4.78125, "rewards/rejected": -9.75, "step": 17870 }, { "epoch": 0.7097632137824266, "grad_norm": 24.484995396410476, "learning_rate": 1.1769356536892225e-07, "logits/chosen": -2.765625, "logits/rejected": -3.046875, "logps/chosen": -668.0, "logps/rejected": -1152.0, "loss": 0.2274, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.0, "rewards/margins": 4.9375, "rewards/rejected": -9.9375, "step": 17880 }, { "epoch": 0.7101601730742513, "grad_norm": 22.492779288631326, "learning_rate": 1.1739975068709776e-07, "logits/chosen": -2.796875, "logits/rejected": -3.046875, "logps/chosen": -692.0, "logps/rejected": -1192.0, "loss": 0.2339, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.1875, "rewards/margins": 5.09375, "rewards/rejected": -10.25, "step": 17890 }, { "epoch": 0.7105571323660759, "grad_norm": 36.602503816922855, "learning_rate": 1.1710619063122349e-07, "logits/chosen": -2.6875, "logits/rejected": -2.875, "logps/chosen": -688.0, "logps/rejected": -1200.0, "loss": 0.2782, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.0, "rewards/margins": 5.21875, "rewards/rejected": -10.1875, "step": 17900 }, { "epoch": 0.7109540916579005, "grad_norm": 20.573555911852882, "learning_rate": 1.1681288576500867e-07, "logits/chosen": -2.890625, "logits/rejected": -3.125, "logps/chosen": -640.0, "logps/rejected": -1168.0, "loss": 0.2299, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.75, "rewards/margins": 5.28125, "rewards/rejected": -10.0625, "step": 17910 }, { "epoch": 0.7113510509497251, "grad_norm": 29.035300811200138, "learning_rate": 1.1651983665167283e-07, "logits/chosen": -2.84375, "logits/rejected": -2.890625, "logps/chosen": -688.0, "logps/rejected": -1176.0, "loss": 0.2627, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.21875, "rewards/margins": 4.90625, "rewards/rejected": -10.125, "step": 17920 }, { "epoch": 0.7117480102415498, "grad_norm": 34.671776457750454, "learning_rate": 1.162270438539443e-07, "logits/chosen": -2.859375, "logits/rejected": -3.046875, "logps/chosen": -680.0, "logps/rejected": -1160.0, "loss": 0.2337, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.0625, "rewards/margins": 4.90625, "rewards/rejected": -10.0, "step": 17930 }, { "epoch": 0.7121449695333744, "grad_norm": 20.611165918159983, "learning_rate": 1.159345079340591e-07, "logits/chosen": -2.765625, "logits/rejected": -2.953125, "logps/chosen": -724.0, "logps/rejected": -1184.0, "loss": 0.2242, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.375, "rewards/margins": 4.71875, "rewards/rejected": -10.125, "step": 17940 }, { "epoch": 0.712541928825199, "grad_norm": 20.34756864005941, "learning_rate": 1.156422294537599e-07, "logits/chosen": -2.8125, "logits/rejected": -3.078125, "logps/chosen": -684.0, "logps/rejected": -1200.0, "loss": 0.2337, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.03125, "rewards/margins": 5.25, "rewards/rejected": -10.25, "step": 17950 }, { "epoch": 0.7129388881170236, "grad_norm": 21.958088072767826, "learning_rate": 1.1535020897429529e-07, "logits/chosen": -2.734375, "logits/rejected": -3.0, "logps/chosen": -740.0, "logps/rejected": -1184.0, "loss": 0.2254, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.4375, "rewards/margins": 4.65625, "rewards/rejected": -10.0625, "step": 17960 }, { "epoch": 0.7133358474088483, "grad_norm": 35.40222249023326, "learning_rate": 1.1505844705641841e-07, "logits/chosen": -2.890625, "logits/rejected": -3.1875, "logps/chosen": -668.0, "logps/rejected": -1152.0, "loss": 0.2331, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.03125, "rewards/margins": 4.90625, "rewards/rejected": -9.9375, "step": 17970 }, { "epoch": 0.7137328067006729, "grad_norm": 25.58307300763323, "learning_rate": 1.1476694426038564e-07, "logits/chosen": -2.734375, "logits/rejected": -2.984375, "logps/chosen": -688.0, "logps/rejected": -1176.0, "loss": 0.2236, "rewards/accuracies": 0.96875, "rewards/chosen": -5.0, "rewards/margins": 5.1875, "rewards/rejected": -10.1875, "step": 17980 }, { "epoch": 0.7141297659924974, "grad_norm": 22.959283280576187, "learning_rate": 1.1447570114595598e-07, "logits/chosen": -2.625, "logits/rejected": -3.0, "logps/chosen": -704.0, "logps/rejected": -1192.0, "loss": 0.2199, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.125, "rewards/margins": 5.03125, "rewards/rejected": -10.1875, "step": 17990 }, { "epoch": 0.714526725284322, "grad_norm": 30.407461015082013, "learning_rate": 1.1418471827238962e-07, "logits/chosen": -2.734375, "logits/rejected": -3.03125, "logps/chosen": -712.0, "logps/rejected": -1224.0, "loss": 0.2175, "rewards/accuracies": 0.96875, "rewards/chosen": -5.1875, "rewards/margins": 5.1875, "rewards/rejected": -10.375, "step": 18000 }, { "epoch": 0.714526725284322, "eval_logits/chosen": -2.75, "eval_logits/rejected": -2.96875, "eval_logps/chosen": -708.0, "eval_logps/rejected": -1104.0, "eval_loss": 0.25117385387420654, "eval_rewards/accuracies": 0.8939311504364014, "eval_rewards/chosen": -5.21875, "eval_rewards/margins": 4.15625, "eval_rewards/rejected": -9.375, "eval_runtime": 5422.8824, "eval_samples_per_second": 32.573, "eval_steps_per_second": 0.509, "step": 18000 }, { "epoch": 0.7149236845761467, "grad_norm": 33.04249241577209, "learning_rate": 1.1389399619844722e-07, "logits/chosen": -2.8125, "logits/rejected": -3.21875, "logps/chosen": -672.0, "logps/rejected": -1192.0, "loss": 0.2546, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.03125, "rewards/margins": 5.1875, "rewards/rejected": -10.25, "step": 18010 }, { "epoch": 0.7153206438679713, "grad_norm": 22.753144944913302, "learning_rate": 1.1360353548238864e-07, "logits/chosen": -2.640625, "logits/rejected": -2.96875, "logps/chosen": -704.0, "logps/rejected": -1200.0, "loss": 0.2505, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.3125, "rewards/margins": 5.0, "rewards/rejected": -10.3125, "step": 18020 }, { "epoch": 0.7157176031597959, "grad_norm": 31.763094123203437, "learning_rate": 1.1331333668197168e-07, "logits/chosen": -2.65625, "logits/rejected": -2.9375, "logps/chosen": -664.0, "logps/rejected": -1152.0, "loss": 0.2266, "rewards/accuracies": 0.9375, "rewards/chosen": -5.0, "rewards/margins": 4.6875, "rewards/rejected": -9.6875, "step": 18030 }, { "epoch": 0.7161145624516206, "grad_norm": 32.69882894141711, "learning_rate": 1.1302340035445126e-07, "logits/chosen": -2.734375, "logits/rejected": -3.015625, "logps/chosen": -656.0, "logps/rejected": -1160.0, "loss": 0.2187, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.90625, "rewards/margins": 5.0625, "rewards/rejected": -10.0, "step": 18040 }, { "epoch": 0.7165115217434452, "grad_norm": 28.58213116702003, "learning_rate": 1.127337270565783e-07, "logits/chosen": -2.65625, "logits/rejected": -2.953125, "logps/chosen": -724.0, "logps/rejected": -1168.0, "loss": 0.2519, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.25, "rewards/margins": 4.71875, "rewards/rejected": -10.0, "step": 18050 }, { "epoch": 0.7169084810352698, "grad_norm": 25.557882770947423, "learning_rate": 1.1244431734459866e-07, "logits/chosen": -2.671875, "logits/rejected": -2.8125, "logps/chosen": -684.0, "logps/rejected": -1232.0, "loss": 0.2378, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.96875, "rewards/margins": 5.5, "rewards/rejected": -10.5, "step": 18060 }, { "epoch": 0.7173054403270944, "grad_norm": 26.24075764018714, "learning_rate": 1.1215517177425221e-07, "logits/chosen": -2.828125, "logits/rejected": -2.921875, "logps/chosen": -660.0, "logps/rejected": -1136.0, "loss": 0.2327, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.90625, "rewards/margins": 4.875, "rewards/rejected": -9.8125, "step": 18070 }, { "epoch": 0.7177023996189191, "grad_norm": 25.936422481527117, "learning_rate": 1.118662909007713e-07, "logits/chosen": -2.625, "logits/rejected": -2.859375, "logps/chosen": -684.0, "logps/rejected": -1152.0, "loss": 0.2304, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.03125, "rewards/margins": 4.6875, "rewards/rejected": -9.75, "step": 18080 }, { "epoch": 0.7180993589107437, "grad_norm": 28.738176283195212, "learning_rate": 1.115776752788801e-07, "logits/chosen": -2.8125, "logits/rejected": -3.109375, "logps/chosen": -692.0, "logps/rejected": -1136.0, "loss": 0.188, "rewards/accuracies": 0.9375, "rewards/chosen": -5.1875, "rewards/margins": 4.625, "rewards/rejected": -9.8125, "step": 18090 }, { "epoch": 0.7184963182025683, "grad_norm": 13.541465395427016, "learning_rate": 1.112893254627934e-07, "logits/chosen": -2.625, "logits/rejected": -2.78125, "logps/chosen": -688.0, "logps/rejected": -1160.0, "loss": 0.2152, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.96875, "rewards/margins": 4.875, "rewards/rejected": -9.875, "step": 18100 }, { "epoch": 0.7188932774943929, "grad_norm": 32.89216542679686, "learning_rate": 1.1100124200621572e-07, "logits/chosen": -2.6875, "logits/rejected": -2.59375, "logps/chosen": -660.0, "logps/rejected": -1176.0, "loss": 0.2799, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.875, "rewards/margins": 5.125, "rewards/rejected": -10.0, "step": 18110 }, { "epoch": 0.7192902367862176, "grad_norm": 37.59797504826882, "learning_rate": 1.1071342546234011e-07, "logits/chosen": -2.84375, "logits/rejected": -3.09375, "logps/chosen": -656.0, "logps/rejected": -1096.0, "loss": 0.2421, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.0625, "rewards/margins": 4.46875, "rewards/rejected": -9.5625, "step": 18120 }, { "epoch": 0.7196871960780422, "grad_norm": 29.112631288414928, "learning_rate": 1.104258763838469e-07, "logits/chosen": -2.71875, "logits/rejected": -2.984375, "logps/chosen": -672.0, "logps/rejected": -1168.0, "loss": 0.23, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -5.03125, "rewards/margins": 4.96875, "rewards/rejected": -10.0, "step": 18130 }, { "epoch": 0.7200841553698668, "grad_norm": 26.592045691068954, "learning_rate": 1.1013859532290282e-07, "logits/chosen": -2.71875, "logits/rejected": -2.78125, "logps/chosen": -680.0, "logps/rejected": -1144.0, "loss": 0.2425, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.0, "rewards/margins": 4.65625, "rewards/rejected": -9.6875, "step": 18140 }, { "epoch": 0.7204811146616914, "grad_norm": 24.055266253310876, "learning_rate": 1.0985158283116016e-07, "logits/chosen": -2.734375, "logits/rejected": -3.09375, "logps/chosen": -668.0, "logps/rejected": -1120.0, "loss": 0.2224, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.96875, "rewards/margins": 4.78125, "rewards/rejected": -9.75, "step": 18150 }, { "epoch": 0.7208780739535161, "grad_norm": 28.827373612287353, "learning_rate": 1.0956483945975531e-07, "logits/chosen": -2.75, "logits/rejected": -3.03125, "logps/chosen": -660.0, "logps/rejected": -1200.0, "loss": 0.1888, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.875, "rewards/margins": 5.40625, "rewards/rejected": -10.3125, "step": 18160 }, { "epoch": 0.7212750332453407, "grad_norm": 29.59383919699462, "learning_rate": 1.0927836575930782e-07, "logits/chosen": -2.71875, "logits/rejected": -2.796875, "logps/chosen": -684.0, "logps/rejected": -1192.0, "loss": 0.2582, "rewards/accuracies": 0.9375, "rewards/chosen": -5.21875, "rewards/margins": 4.9375, "rewards/rejected": -10.125, "step": 18170 }, { "epoch": 0.7216719925371653, "grad_norm": 33.16984223209975, "learning_rate": 1.089921622799197e-07, "logits/chosen": -2.71875, "logits/rejected": -2.859375, "logps/chosen": -648.0, "logps/rejected": -1160.0, "loss": 0.2269, "rewards/accuracies": 0.9375, "rewards/chosen": -4.84375, "rewards/margins": 5.09375, "rewards/rejected": -9.9375, "step": 18180 }, { "epoch": 0.7220689518289899, "grad_norm": 40.35708700906692, "learning_rate": 1.0870622957117364e-07, "logits/chosen": -2.75, "logits/rejected": -2.9375, "logps/chosen": -672.0, "logps/rejected": -1184.0, "loss": 0.2566, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.96875, "rewards/margins": 5.21875, "rewards/rejected": -10.1875, "step": 18190 }, { "epoch": 0.7224659111208146, "grad_norm": 17.45494539846238, "learning_rate": 1.0842056818213286e-07, "logits/chosen": -2.5, "logits/rejected": -2.765625, "logps/chosen": -696.0, "logps/rejected": -1184.0, "loss": 0.2668, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.09375, "rewards/margins": 4.96875, "rewards/rejected": -10.0625, "step": 18200 }, { "epoch": 0.7228628704126392, "grad_norm": 25.177526891985742, "learning_rate": 1.0813517866133923e-07, "logits/chosen": -2.703125, "logits/rejected": -2.875, "logps/chosen": -676.0, "logps/rejected": -1168.0, "loss": 0.2091, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -4.90625, "rewards/margins": 4.90625, "rewards/rejected": -9.8125, "step": 18210 }, { "epoch": 0.7232598297044638, "grad_norm": 21.29155968546336, "learning_rate": 1.0785006155681257e-07, "logits/chosen": -2.953125, "logits/rejected": -2.953125, "logps/chosen": -628.0, "logps/rejected": -1168.0, "loss": 0.2193, "rewards/accuracies": 0.96875, "rewards/chosen": -4.625, "rewards/margins": 5.375, "rewards/rejected": -10.0, "step": 18220 }, { "epoch": 0.7236567889962884, "grad_norm": 32.35957990892271, "learning_rate": 1.0756521741604987e-07, "logits/chosen": -2.59375, "logits/rejected": -2.84375, "logps/chosen": -676.0, "logps/rejected": -1176.0, "loss": 0.2168, "rewards/accuracies": 0.96875, "rewards/chosen": -4.8125, "rewards/margins": 5.1875, "rewards/rejected": -10.0, "step": 18230 }, { "epoch": 0.7240537482881131, "grad_norm": 19.755322857697013, "learning_rate": 1.0728064678602358e-07, "logits/chosen": -2.59375, "logits/rejected": -2.796875, "logps/chosen": -664.0, "logps/rejected": -1192.0, "loss": 0.2085, "rewards/accuracies": 0.96875, "rewards/chosen": -4.96875, "rewards/margins": 5.21875, "rewards/rejected": -10.1875, "step": 18240 }, { "epoch": 0.7244507075799377, "grad_norm": 33.240590918843544, "learning_rate": 1.0699635021318137e-07, "logits/chosen": -2.796875, "logits/rejected": -2.984375, "logps/chosen": -664.0, "logps/rejected": -1136.0, "loss": 0.2746, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.9375, "rewards/margins": 4.9375, "rewards/rejected": -9.875, "step": 18250 }, { "epoch": 0.7248476668717623, "grad_norm": 25.450571831120534, "learning_rate": 1.067123282434443e-07, "logits/chosen": -2.796875, "logits/rejected": -3.0, "logps/chosen": -652.0, "logps/rejected": -1160.0, "loss": 0.2252, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.875, "rewards/margins": 5.1875, "rewards/rejected": -10.0625, "step": 18260 }, { "epoch": 0.7252446261635869, "grad_norm": 30.892295854976634, "learning_rate": 1.064285814222062e-07, "logits/chosen": -2.734375, "logits/rejected": -2.875, "logps/chosen": -652.0, "logps/rejected": -1232.0, "loss": 0.2428, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.8125, "rewards/margins": 5.75, "rewards/rejected": -10.5625, "step": 18270 }, { "epoch": 0.7256415854554116, "grad_norm": 26.85617114677738, "learning_rate": 1.0614511029433262e-07, "logits/chosen": -2.6875, "logits/rejected": -3.09375, "logps/chosen": -648.0, "logps/rejected": -1144.0, "loss": 0.2352, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.8125, "rewards/margins": 5.09375, "rewards/rejected": -9.875, "step": 18280 }, { "epoch": 0.7260385447472362, "grad_norm": 20.928303255246824, "learning_rate": 1.0586191540415978e-07, "logits/chosen": -2.65625, "logits/rejected": -3.03125, "logps/chosen": -592.0, "logps/rejected": -1112.0, "loss": 0.2105, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.34375, "rewards/margins": 5.09375, "rewards/rejected": -9.4375, "step": 18290 }, { "epoch": 0.7264355040390608, "grad_norm": 26.858699795164725, "learning_rate": 1.055789972954933e-07, "logits/chosen": -2.8125, "logits/rejected": -2.84375, "logps/chosen": -660.0, "logps/rejected": -1144.0, "loss": 0.2351, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.84375, "rewards/margins": 4.90625, "rewards/rejected": -9.75, "step": 18300 }, { "epoch": 0.7268324633308855, "grad_norm": 46.32110952045679, "learning_rate": 1.0529635651160737e-07, "logits/chosen": -2.578125, "logits/rejected": -2.8125, "logps/chosen": -692.0, "logps/rejected": -1176.0, "loss": 0.2267, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.9375, "rewards/margins": 5.125, "rewards/rejected": -10.0625, "step": 18310 }, { "epoch": 0.7272294226227101, "grad_norm": 33.815104794893855, "learning_rate": 1.0501399359524349e-07, "logits/chosen": -2.6875, "logits/rejected": -2.8125, "logps/chosen": -664.0, "logps/rejected": -1152.0, "loss": 0.2364, "rewards/accuracies": 0.96875, "rewards/chosen": -5.0, "rewards/margins": 4.78125, "rewards/rejected": -9.8125, "step": 18320 }, { "epoch": 0.7276263819145347, "grad_norm": 27.883796927870165, "learning_rate": 1.0473190908860988e-07, "logits/chosen": -2.75, "logits/rejected": -3.078125, "logps/chosen": -684.0, "logps/rejected": -1192.0, "loss": 0.2331, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.15625, "rewards/margins": 5.1875, "rewards/rejected": -10.375, "step": 18330 }, { "epoch": 0.7280233412063593, "grad_norm": 27.413025891459352, "learning_rate": 1.0445010353338005e-07, "logits/chosen": -2.828125, "logits/rejected": -3.0, "logps/chosen": -652.0, "logps/rejected": -1168.0, "loss": 0.2191, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.96875, "rewards/margins": 5.09375, "rewards/rejected": -10.0625, "step": 18340 }, { "epoch": 0.728420300498184, "grad_norm": 19.462922086193636, "learning_rate": 1.0416857747069175e-07, "logits/chosen": -2.6875, "logits/rejected": -2.734375, "logps/chosen": -692.0, "logps/rejected": -1176.0, "loss": 0.2064, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -5.03125, "rewards/margins": 5.0, "rewards/rejected": -10.0, "step": 18350 }, { "epoch": 0.7288172597900086, "grad_norm": 20.635682878928293, "learning_rate": 1.0388733144114603e-07, "logits/chosen": -2.6875, "logits/rejected": -3.140625, "logps/chosen": -656.0, "logps/rejected": -1152.0, "loss": 0.2218, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.84375, "rewards/margins": 5.1875, "rewards/rejected": -10.0, "step": 18360 }, { "epoch": 0.7292142190818331, "grad_norm": 31.652423123291076, "learning_rate": 1.0360636598480624e-07, "logits/chosen": -2.6875, "logits/rejected": -2.859375, "logps/chosen": -644.0, "logps/rejected": -1128.0, "loss": 0.2319, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.75, "rewards/margins": 4.84375, "rewards/rejected": -9.625, "step": 18370 }, { "epoch": 0.7296111783736577, "grad_norm": 26.555800009461258, "learning_rate": 1.0332568164119701e-07, "logits/chosen": -2.859375, "logits/rejected": -3.109375, "logps/chosen": -644.0, "logps/rejected": -1128.0, "loss": 0.2165, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.84375, "rewards/margins": 4.75, "rewards/rejected": -9.625, "step": 18380 }, { "epoch": 0.7300081376654824, "grad_norm": 19.060231339139694, "learning_rate": 1.0304527894930327e-07, "logits/chosen": -2.71875, "logits/rejected": -2.71875, "logps/chosen": -672.0, "logps/rejected": -1240.0, "loss": 0.2347, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.03125, "rewards/margins": 5.59375, "rewards/rejected": -10.625, "step": 18390 }, { "epoch": 0.730405096957307, "grad_norm": 24.55678627331702, "learning_rate": 1.027651584475688e-07, "logits/chosen": -2.859375, "logits/rejected": -2.9375, "logps/chosen": -692.0, "logps/rejected": -1208.0, "loss": 0.2307, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.3125, "rewards/margins": 5.125, "rewards/rejected": -10.4375, "step": 18400 }, { "epoch": 0.7308020562491316, "grad_norm": 24.060356673533846, "learning_rate": 1.0248532067389579e-07, "logits/chosen": -2.796875, "logits/rejected": -3.09375, "logps/chosen": -680.0, "logps/rejected": -1200.0, "loss": 0.2208, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.96875, "rewards/margins": 5.34375, "rewards/rejected": -10.3125, "step": 18410 }, { "epoch": 0.7311990155409562, "grad_norm": 27.726802112187084, "learning_rate": 1.0220576616564319e-07, "logits/chosen": -2.8125, "logits/rejected": -2.859375, "logps/chosen": -640.0, "logps/rejected": -1200.0, "loss": 0.2243, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.8125, "rewards/margins": 5.46875, "rewards/rejected": -10.25, "step": 18420 }, { "epoch": 0.7315959748327809, "grad_norm": 27.851347955365064, "learning_rate": 1.0192649545962639e-07, "logits/chosen": -2.703125, "logits/rejected": -2.921875, "logps/chosen": -648.0, "logps/rejected": -1144.0, "loss": 0.2214, "rewards/accuracies": 0.96875, "rewards/chosen": -4.8125, "rewards/margins": 5.03125, "rewards/rejected": -9.875, "step": 18430 }, { "epoch": 0.7319929341246055, "grad_norm": 28.649444507395295, "learning_rate": 1.0164750909211572e-07, "logits/chosen": -2.796875, "logits/rejected": -3.046875, "logps/chosen": -640.0, "logps/rejected": -1192.0, "loss": 0.2498, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.78125, "rewards/margins": 5.46875, "rewards/rejected": -10.25, "step": 18440 }, { "epoch": 0.7323898934164301, "grad_norm": 31.202735413883097, "learning_rate": 1.0136880759883531e-07, "logits/chosen": -2.703125, "logits/rejected": -2.84375, "logps/chosen": -668.0, "logps/rejected": -1168.0, "loss": 0.2596, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.84375, "rewards/margins": 5.25, "rewards/rejected": -10.0625, "step": 18450 }, { "epoch": 0.7327868527082547, "grad_norm": 34.863472366765556, "learning_rate": 1.0109039151496233e-07, "logits/chosen": -2.671875, "logits/rejected": -2.90625, "logps/chosen": -680.0, "logps/rejected": -1152.0, "loss": 0.2397, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.0, "rewards/margins": 4.90625, "rewards/rejected": -9.875, "step": 18460 }, { "epoch": 0.7331838120000794, "grad_norm": 27.045687332989885, "learning_rate": 1.0081226137512607e-07, "logits/chosen": -2.734375, "logits/rejected": -2.8125, "logps/chosen": -672.0, "logps/rejected": -1192.0, "loss": 0.232, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.96875, "rewards/margins": 5.25, "rewards/rejected": -10.25, "step": 18470 }, { "epoch": 0.733580771291904, "grad_norm": 16.627072117254034, "learning_rate": 1.0053441771340648e-07, "logits/chosen": -2.765625, "logits/rejected": -2.921875, "logps/chosen": -728.0, "logps/rejected": -1192.0, "loss": 0.2186, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.59375, "rewards/margins": 4.71875, "rewards/rejected": -10.3125, "step": 18480 }, { "epoch": 0.7339777305837286, "grad_norm": 17.273611150530627, "learning_rate": 1.0025686106333367e-07, "logits/chosen": -2.671875, "logits/rejected": -2.875, "logps/chosen": -696.0, "logps/rejected": -1192.0, "loss": 0.2366, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.0625, "rewards/margins": 5.125, "rewards/rejected": -10.1875, "step": 18490 }, { "epoch": 0.7343746898755532, "grad_norm": 34.85506598745356, "learning_rate": 9.997959195788638e-08, "logits/chosen": -2.828125, "logits/rejected": -3.078125, "logps/chosen": -664.0, "logps/rejected": -1160.0, "loss": 0.2102, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.96875, "rewards/margins": 5.0625, "rewards/rejected": -10.0, "step": 18500 }, { "epoch": 0.7347716491673779, "grad_norm": 27.451823528560194, "learning_rate": 9.97026109294912e-08, "logits/chosen": -2.84375, "logits/rejected": -3.046875, "logps/chosen": -672.0, "logps/rejected": -1168.0, "loss": 0.1978, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.03125, "rewards/margins": 5.125, "rewards/rejected": -10.1875, "step": 18510 }, { "epoch": 0.7351686084592025, "grad_norm": 24.857554308743335, "learning_rate": 9.942591851002178e-08, "logits/chosen": -2.84375, "logits/rejected": -3.03125, "logps/chosen": -704.0, "logps/rejected": -1240.0, "loss": 0.2045, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.0625, "rewards/margins": 5.4375, "rewards/rejected": -10.5, "step": 18520 }, { "epoch": 0.7355655677510271, "grad_norm": 33.468093174462446, "learning_rate": 9.914951523079726e-08, "logits/chosen": -2.8125, "logits/rejected": -2.984375, "logps/chosen": -640.0, "logps/rejected": -1160.0, "loss": 0.2474, "rewards/accuracies": 0.96875, "rewards/chosen": -4.84375, "rewards/margins": 5.15625, "rewards/rejected": -10.0, "step": 18530 }, { "epoch": 0.7359625270428518, "grad_norm": 38.32100855232743, "learning_rate": 9.88734016225819e-08, "logits/chosen": -2.8125, "logits/rejected": -2.953125, "logps/chosen": -644.0, "logps/rejected": -1184.0, "loss": 0.1974, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.71875, "rewards/margins": 5.3125, "rewards/rejected": -10.0, "step": 18540 }, { "epoch": 0.7363594863346764, "grad_norm": 24.206026453230077, "learning_rate": 9.859757821558337e-08, "logits/chosen": -2.671875, "logits/rejected": -2.84375, "logps/chosen": -708.0, "logps/rejected": -1200.0, "loss": 0.2003, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.3125, "rewards/margins": 4.96875, "rewards/rejected": -10.25, "step": 18550 }, { "epoch": 0.736756445626501, "grad_norm": 21.82488441970588, "learning_rate": 9.832204553945222e-08, "logits/chosen": -2.71875, "logits/rejected": -2.890625, "logps/chosen": -700.0, "logps/rejected": -1192.0, "loss": 0.1921, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.09375, "rewards/margins": 5.125, "rewards/rejected": -10.1875, "step": 18560 }, { "epoch": 0.7371534049183256, "grad_norm": 21.172217576287306, "learning_rate": 9.804680412328086e-08, "logits/chosen": -2.703125, "logits/rejected": -3.171875, "logps/chosen": -676.0, "logps/rejected": -1168.0, "loss": 0.2323, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.0625, "rewards/margins": 4.96875, "rewards/rejected": -10.0625, "step": 18570 }, { "epoch": 0.7375503642101503, "grad_norm": 30.261107516549526, "learning_rate": 9.777185449560216e-08, "logits/chosen": -2.78125, "logits/rejected": -2.90625, "logps/chosen": -684.0, "logps/rejected": -1152.0, "loss": 0.2457, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.125, "rewards/margins": 4.59375, "rewards/rejected": -9.75, "step": 18580 }, { "epoch": 0.7379473235019749, "grad_norm": 15.067533384567392, "learning_rate": 9.749719718438895e-08, "logits/chosen": -2.828125, "logits/rejected": -3.0, "logps/chosen": -704.0, "logps/rejected": -1200.0, "loss": 0.1898, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.25, "rewards/margins": 5.125, "rewards/rejected": -10.375, "step": 18590 }, { "epoch": 0.7383442827937995, "grad_norm": 31.556014880757807, "learning_rate": 9.722283271705253e-08, "logits/chosen": -2.890625, "logits/rejected": -3.171875, "logps/chosen": -652.0, "logps/rejected": -1208.0, "loss": 0.2241, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.90625, "rewards/margins": 5.375, "rewards/rejected": -10.25, "step": 18600 }, { "epoch": 0.7387412420856241, "grad_norm": 26.017224856339926, "learning_rate": 9.694876162044182e-08, "logits/chosen": -2.796875, "logits/rejected": -2.96875, "logps/chosen": -684.0, "logps/rejected": -1224.0, "loss": 0.221, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.0, "rewards/margins": 5.59375, "rewards/rejected": -10.625, "step": 18610 }, { "epoch": 0.7391382013774488, "grad_norm": 18.51445411012914, "learning_rate": 9.667498442084271e-08, "logits/chosen": -2.71875, "logits/rejected": -2.859375, "logps/chosen": -644.0, "logps/rejected": -1160.0, "loss": 0.2037, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.8125, "rewards/margins": 5.0, "rewards/rejected": -9.8125, "step": 18620 }, { "epoch": 0.7395351606692734, "grad_norm": 33.87791168916785, "learning_rate": 9.640150164397631e-08, "logits/chosen": -2.765625, "logits/rejected": -3.015625, "logps/chosen": -656.0, "logps/rejected": -1184.0, "loss": 0.2015, "rewards/accuracies": 0.96875, "rewards/chosen": -4.875, "rewards/margins": 5.28125, "rewards/rejected": -10.1875, "step": 18630 }, { "epoch": 0.739932119961098, "grad_norm": 24.541309479437025, "learning_rate": 9.612831381499883e-08, "logits/chosen": -2.609375, "logits/rejected": -2.8125, "logps/chosen": -660.0, "logps/rejected": -1184.0, "loss": 0.1945, "rewards/accuracies": 0.96875, "rewards/chosen": -4.78125, "rewards/margins": 5.28125, "rewards/rejected": -10.0625, "step": 18640 }, { "epoch": 0.7403290792529226, "grad_norm": 23.27511901090879, "learning_rate": 9.585542145849965e-08, "logits/chosen": -2.6875, "logits/rejected": -2.875, "logps/chosen": -668.0, "logps/rejected": -1168.0, "loss": 0.2093, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.875, "rewards/margins": 5.21875, "rewards/rejected": -10.125, "step": 18650 }, { "epoch": 0.7407260385447473, "grad_norm": 26.529787102708006, "learning_rate": 9.558282509850116e-08, "logits/chosen": -2.890625, "logits/rejected": -2.875, "logps/chosen": -672.0, "logps/rejected": -1168.0, "loss": 0.2169, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.03125, "rewards/margins": 4.90625, "rewards/rejected": -9.9375, "step": 18660 }, { "epoch": 0.7411229978365719, "grad_norm": 21.062541391995623, "learning_rate": 9.531052525845707e-08, "logits/chosen": -2.703125, "logits/rejected": -2.9375, "logps/chosen": -664.0, "logps/rejected": -1176.0, "loss": 0.2205, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.9375, "rewards/margins": 5.1875, "rewards/rejected": -10.125, "step": 18670 }, { "epoch": 0.7415199571283965, "grad_norm": 32.30131881200257, "learning_rate": 9.503852246125183e-08, "logits/chosen": -2.75, "logits/rejected": -2.75, "logps/chosen": -740.0, "logps/rejected": -1256.0, "loss": 0.2439, "rewards/accuracies": 0.9375, "rewards/chosen": -5.40625, "rewards/margins": 5.03125, "rewards/rejected": -10.4375, "step": 18680 }, { "epoch": 0.7419169164202211, "grad_norm": 28.41776060187165, "learning_rate": 9.476681722919938e-08, "logits/chosen": -2.90625, "logits/rejected": -3.109375, "logps/chosen": -680.0, "logps/rejected": -1192.0, "loss": 0.2508, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.28125, "rewards/margins": 5.0, "rewards/rejected": -10.25, "step": 18690 }, { "epoch": 0.7423138757120458, "grad_norm": 27.866595884151522, "learning_rate": 9.449541008404244e-08, "logits/chosen": -2.78125, "logits/rejected": -3.03125, "logps/chosen": -684.0, "logps/rejected": -1160.0, "loss": 0.2697, "rewards/accuracies": 0.9375, "rewards/chosen": -4.90625, "rewards/margins": 4.84375, "rewards/rejected": -9.75, "step": 18700 }, { "epoch": 0.7427108350038704, "grad_norm": 24.417684229910236, "learning_rate": 9.422430154695129e-08, "logits/chosen": -2.765625, "logits/rejected": -2.859375, "logps/chosen": -664.0, "logps/rejected": -1200.0, "loss": 0.2164, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.8125, "rewards/margins": 5.46875, "rewards/rejected": -10.25, "step": 18710 }, { "epoch": 0.743107794295695, "grad_norm": 44.45810316030979, "learning_rate": 9.395349213852274e-08, "logits/chosen": -2.671875, "logits/rejected": -2.96875, "logps/chosen": -676.0, "logps/rejected": -1144.0, "loss": 0.2253, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.0, "rewards/margins": 4.59375, "rewards/rejected": -9.5625, "step": 18720 }, { "epoch": 0.7435047535875196, "grad_norm": 32.78753823094479, "learning_rate": 9.368298237877916e-08, "logits/chosen": -2.953125, "logits/rejected": -3.046875, "logps/chosen": -696.0, "logps/rejected": -1184.0, "loss": 0.2383, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.34375, "rewards/margins": 4.90625, "rewards/rejected": -10.25, "step": 18730 }, { "epoch": 0.7439017128793443, "grad_norm": 24.031139988795697, "learning_rate": 9.341277278716753e-08, "logits/chosen": -2.703125, "logits/rejected": -3.09375, "logps/chosen": -660.0, "logps/rejected": -1144.0, "loss": 0.2522, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -5.0, "rewards/margins": 4.875, "rewards/rejected": -9.875, "step": 18740 }, { "epoch": 0.7442986721711689, "grad_norm": 19.70249960056739, "learning_rate": 9.314286388255848e-08, "logits/chosen": -2.796875, "logits/rejected": -3.15625, "logps/chosen": -660.0, "logps/rejected": -1184.0, "loss": 0.2137, "rewards/accuracies": 0.96875, "rewards/chosen": -4.9375, "rewards/margins": 5.28125, "rewards/rejected": -10.25, "step": 18750 }, { "epoch": 0.7446956314629934, "grad_norm": 40.74029143349819, "learning_rate": 9.28732561832454e-08, "logits/chosen": -2.796875, "logits/rejected": -3.03125, "logps/chosen": -648.0, "logps/rejected": -1144.0, "loss": 0.2318, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.78125, "rewards/margins": 5.1875, "rewards/rejected": -10.0, "step": 18760 }, { "epoch": 0.745092590754818, "grad_norm": 26.75325843144008, "learning_rate": 9.2603950206943e-08, "logits/chosen": -2.75, "logits/rejected": -3.078125, "logps/chosen": -664.0, "logps/rejected": -1168.0, "loss": 0.2494, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.8125, "rewards/margins": 5.1875, "rewards/rejected": -10.0, "step": 18770 }, { "epoch": 0.7454895500466427, "grad_norm": 25.26438781712608, "learning_rate": 9.233494647078676e-08, "logits/chosen": -2.8125, "logits/rejected": -3.0625, "logps/chosen": -688.0, "logps/rejected": -1136.0, "loss": 0.2187, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.0, "rewards/margins": 4.84375, "rewards/rejected": -9.8125, "step": 18780 }, { "epoch": 0.7458865093384673, "grad_norm": 22.55681572845801, "learning_rate": 9.206624549133168e-08, "logits/chosen": -2.75, "logits/rejected": -3.078125, "logps/chosen": -648.0, "logps/rejected": -1136.0, "loss": 0.2119, "rewards/accuracies": 0.96875, "rewards/chosen": -4.625, "rewards/margins": 5.15625, "rewards/rejected": -9.8125, "step": 18790 }, { "epoch": 0.7462834686302919, "grad_norm": 26.85756299400725, "learning_rate": 9.179784778455152e-08, "logits/chosen": -2.890625, "logits/rejected": -3.265625, "logps/chosen": -680.0, "logps/rejected": -1200.0, "loss": 0.2349, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.0625, "rewards/margins": 5.3125, "rewards/rejected": -10.375, "step": 18800 }, { "epoch": 0.7466804279221166, "grad_norm": 34.18821215636006, "learning_rate": 9.152975386583772e-08, "logits/chosen": -2.796875, "logits/rejected": -2.953125, "logps/chosen": -648.0, "logps/rejected": -1152.0, "loss": 0.2574, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.8125, "rewards/margins": 4.8125, "rewards/rejected": -9.625, "step": 18810 }, { "epoch": 0.7470773872139412, "grad_norm": 37.58111617465607, "learning_rate": 9.126196424999819e-08, "logits/chosen": -2.75, "logits/rejected": -2.96875, "logps/chosen": -676.0, "logps/rejected": -1120.0, "loss": 0.2561, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.9375, "rewards/margins": 4.65625, "rewards/rejected": -9.625, "step": 18820 }, { "epoch": 0.7474743465057658, "grad_norm": 38.13354392442419, "learning_rate": 9.099447945125652e-08, "logits/chosen": -2.765625, "logits/rejected": -3.0, "logps/chosen": -644.0, "logps/rejected": -1176.0, "loss": 0.2211, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.65625, "rewards/margins": 5.34375, "rewards/rejected": -10.0, "step": 18830 }, { "epoch": 0.7478713057975904, "grad_norm": 31.663319422828682, "learning_rate": 9.072729998325112e-08, "logits/chosen": -2.75, "logits/rejected": -2.96875, "logps/chosen": -664.0, "logps/rejected": -1232.0, "loss": 0.2289, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.8125, "rewards/margins": 5.8125, "rewards/rejected": -10.625, "step": 18840 }, { "epoch": 0.7482682650894151, "grad_norm": 20.093166569468035, "learning_rate": 9.046042635903409e-08, "logits/chosen": -2.875, "logits/rejected": -3.09375, "logps/chosen": -648.0, "logps/rejected": -1152.0, "loss": 0.2189, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.78125, "rewards/margins": 5.0, "rewards/rejected": -9.75, "step": 18850 }, { "epoch": 0.7486652243812397, "grad_norm": 31.180057905905993, "learning_rate": 9.019385909107008e-08, "logits/chosen": -2.6875, "logits/rejected": -3.125, "logps/chosen": -652.0, "logps/rejected": -1128.0, "loss": 0.209, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.71875, "rewards/margins": 5.03125, "rewards/rejected": -9.75, "step": 18860 }, { "epoch": 0.7490621836730643, "grad_norm": 20.105207719123364, "learning_rate": 8.992759869123554e-08, "logits/chosen": -2.625, "logits/rejected": -2.78125, "logps/chosen": -624.0, "logps/rejected": -1144.0, "loss": 0.2047, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.46875, "rewards/margins": 5.28125, "rewards/rejected": -9.75, "step": 18870 }, { "epoch": 0.7494591429648889, "grad_norm": 23.729996779474938, "learning_rate": 8.966164567081752e-08, "logits/chosen": -2.546875, "logits/rejected": -2.84375, "logps/chosen": -644.0, "logps/rejected": -1168.0, "loss": 0.2028, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.75, "rewards/margins": 5.34375, "rewards/rejected": -10.0625, "step": 18880 }, { "epoch": 0.7498561022567136, "grad_norm": 26.624932121288392, "learning_rate": 8.939600054051307e-08, "logits/chosen": -2.734375, "logits/rejected": -2.953125, "logps/chosen": -660.0, "logps/rejected": -1200.0, "loss": 0.2112, "rewards/accuracies": 0.9375, "rewards/chosen": -5.0, "rewards/margins": 5.4375, "rewards/rejected": -10.4375, "step": 18890 }, { "epoch": 0.7502530615485382, "grad_norm": 21.88280766331663, "learning_rate": 8.9130663810428e-08, "logits/chosen": -2.734375, "logits/rejected": -3.046875, "logps/chosen": -644.0, "logps/rejected": -1240.0, "loss": 0.1945, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.71875, "rewards/margins": 6.0, "rewards/rejected": -10.6875, "step": 18900 }, { "epoch": 0.7506500208403628, "grad_norm": 19.636785366596683, "learning_rate": 8.886563599007571e-08, "logits/chosen": -2.8125, "logits/rejected": -3.125, "logps/chosen": -656.0, "logps/rejected": -1192.0, "loss": 0.2214, "rewards/accuracies": 0.96875, "rewards/chosen": -4.90625, "rewards/margins": 5.28125, "rewards/rejected": -10.1875, "step": 18910 }, { "epoch": 0.7510469801321874, "grad_norm": 37.178568276238884, "learning_rate": 8.860091758837649e-08, "logits/chosen": -2.734375, "logits/rejected": -2.90625, "logps/chosen": -672.0, "logps/rejected": -1176.0, "loss": 0.2196, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.9375, "rewards/margins": 5.15625, "rewards/rejected": -10.125, "step": 18920 }, { "epoch": 0.7514439394240121, "grad_norm": 25.367397199537223, "learning_rate": 8.833650911365643e-08, "logits/chosen": -2.71875, "logits/rejected": -2.9375, "logps/chosen": -692.0, "logps/rejected": -1192.0, "loss": 0.2194, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.1875, "rewards/margins": 5.09375, "rewards/rejected": -10.25, "step": 18930 }, { "epoch": 0.7518408987158367, "grad_norm": 24.585196145041813, "learning_rate": 8.807241107364661e-08, "logits/chosen": -2.78125, "logits/rejected": -2.96875, "logps/chosen": -672.0, "logps/rejected": -1144.0, "loss": 0.228, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.96875, "rewards/margins": 4.90625, "rewards/rejected": -9.875, "step": 18940 }, { "epoch": 0.7522378580076613, "grad_norm": 29.763230962310974, "learning_rate": 8.780862397548202e-08, "logits/chosen": -2.6875, "logits/rejected": -2.984375, "logps/chosen": -696.0, "logps/rejected": -1272.0, "loss": 0.2094, "rewards/accuracies": 0.96875, "rewards/chosen": -5.0625, "rewards/margins": 5.875, "rewards/rejected": -10.9375, "step": 18950 }, { "epoch": 0.7526348172994859, "grad_norm": 26.189141918670497, "learning_rate": 8.754514832570042e-08, "logits/chosen": -2.734375, "logits/rejected": -2.84375, "logps/chosen": -680.0, "logps/rejected": -1232.0, "loss": 0.2449, "rewards/accuracies": 0.9375, "rewards/chosen": -4.96875, "rewards/margins": 5.59375, "rewards/rejected": -10.5625, "step": 18960 }, { "epoch": 0.7530317765913106, "grad_norm": 20.787444358290525, "learning_rate": 8.728198463024153e-08, "logits/chosen": -2.65625, "logits/rejected": -2.765625, "logps/chosen": -660.0, "logps/rejected": -1128.0, "loss": 0.2147, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.78125, "rewards/margins": 4.875, "rewards/rejected": -9.625, "step": 18970 }, { "epoch": 0.7534287358831352, "grad_norm": 16.50403729530548, "learning_rate": 8.701913339444602e-08, "logits/chosen": -2.765625, "logits/rejected": -2.953125, "logps/chosen": -716.0, "logps/rejected": -1224.0, "loss": 0.1679, "rewards/accuracies": 0.96875, "rewards/chosen": -5.40625, "rewards/margins": 5.125, "rewards/rejected": -10.5, "step": 18980 }, { "epoch": 0.7538256951749598, "grad_norm": 19.670338498796735, "learning_rate": 8.675659512305475e-08, "logits/chosen": -2.8125, "logits/rejected": -2.859375, "logps/chosen": -680.0, "logps/rejected": -1176.0, "loss": 0.2301, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.09375, "rewards/margins": 5.0625, "rewards/rejected": -10.125, "step": 18990 }, { "epoch": 0.7542226544667844, "grad_norm": 23.320771255488456, "learning_rate": 8.649437032020734e-08, "logits/chosen": -2.640625, "logits/rejected": -2.9375, "logps/chosen": -668.0, "logps/rejected": -1176.0, "loss": 0.2197, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.8125, "rewards/margins": 5.15625, "rewards/rejected": -10.0, "step": 19000 }, { "epoch": 0.7542226544667844, "eval_logits/chosen": -2.75, "eval_logits/rejected": -2.96875, "eval_logps/chosen": -708.0, "eval_logps/rejected": -1112.0, "eval_loss": 0.2518906891345978, "eval_rewards/accuracies": 0.8947010636329651, "eval_rewards/chosen": -5.21875, "eval_rewards/margins": 4.21875, "eval_rewards/rejected": -9.4375, "eval_runtime": 5418.398, "eval_samples_per_second": 32.599, "eval_steps_per_second": 0.509, "step": 19000 }, { "epoch": 0.7546196137586091, "grad_norm": 22.338377684534894, "learning_rate": 8.623245948944175e-08, "logits/chosen": -2.6875, "logits/rejected": -2.859375, "logps/chosen": -648.0, "logps/rejected": -1168.0, "loss": 0.2497, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.78125, "rewards/margins": 5.21875, "rewards/rejected": -10.0, "step": 19010 }, { "epoch": 0.7550165730504337, "grad_norm": 19.828594582506316, "learning_rate": 8.59708631336927e-08, "logits/chosen": -2.71875, "logits/rejected": -3.015625, "logps/chosen": -664.0, "logps/rejected": -1128.0, "loss": 0.2124, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.75, "rewards/margins": 4.90625, "rewards/rejected": -9.625, "step": 19020 }, { "epoch": 0.7554135323422583, "grad_norm": 34.68366431453179, "learning_rate": 8.570958175529141e-08, "logits/chosen": -2.609375, "logits/rejected": -2.828125, "logps/chosen": -640.0, "logps/rejected": -1176.0, "loss": 0.2272, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.71875, "rewards/margins": 5.4375, "rewards/rejected": -10.1875, "step": 19030 }, { "epoch": 0.7558104916340829, "grad_norm": 30.992126673138664, "learning_rate": 8.544861585596403e-08, "logits/chosen": -2.859375, "logits/rejected": -3.125, "logps/chosen": -664.0, "logps/rejected": -1176.0, "loss": 0.2022, "rewards/accuracies": 0.96875, "rewards/chosen": -5.09375, "rewards/margins": 5.03125, "rewards/rejected": -10.125, "step": 19040 }, { "epoch": 0.7562074509259076, "grad_norm": 41.747546443544366, "learning_rate": 8.518796593683084e-08, "logits/chosen": -2.859375, "logits/rejected": -3.4375, "logps/chosen": -664.0, "logps/rejected": -1136.0, "loss": 0.2557, "rewards/accuracies": 0.9375, "rewards/chosen": -5.03125, "rewards/margins": 4.875, "rewards/rejected": -9.9375, "step": 19050 }, { "epoch": 0.7566044102177322, "grad_norm": 25.678017253507882, "learning_rate": 8.492763249840565e-08, "logits/chosen": -2.625, "logits/rejected": -2.796875, "logps/chosen": -692.0, "logps/rejected": -1232.0, "loss": 0.1574, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.0625, "rewards/margins": 5.25, "rewards/rejected": -10.3125, "step": 19060 }, { "epoch": 0.7570013695095568, "grad_norm": 24.043802512182832, "learning_rate": 8.466761604059422e-08, "logits/chosen": -2.8125, "logits/rejected": -3.3125, "logps/chosen": -672.0, "logps/rejected": -1168.0, "loss": 0.2174, "rewards/accuracies": 0.9375, "rewards/chosen": -4.96875, "rewards/margins": 5.0, "rewards/rejected": -10.0, "step": 19070 }, { "epoch": 0.7573983288013815, "grad_norm": 28.036900878122914, "learning_rate": 8.440791706269392e-08, "logits/chosen": -2.734375, "logits/rejected": -2.890625, "logps/chosen": -680.0, "logps/rejected": -1168.0, "loss": 0.1992, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.03125, "rewards/margins": 4.9375, "rewards/rejected": -9.9375, "step": 19080 }, { "epoch": 0.7577952880932061, "grad_norm": 20.76673780854718, "learning_rate": 8.414853606339231e-08, "logits/chosen": -2.78125, "logits/rejected": -3.140625, "logps/chosen": -672.0, "logps/rejected": -1160.0, "loss": 0.2186, "rewards/accuracies": 0.9375, "rewards/chosen": -5.1875, "rewards/margins": 4.875, "rewards/rejected": -10.0625, "step": 19090 }, { "epoch": 0.7581922473850307, "grad_norm": 20.105592884960853, "learning_rate": 8.388947354076625e-08, "logits/chosen": -2.8125, "logits/rejected": -3.03125, "logps/chosen": -692.0, "logps/rejected": -1184.0, "loss": 0.226, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.1875, "rewards/margins": 4.9375, "rewards/rejected": -10.125, "step": 19100 }, { "epoch": 0.7585892066768553, "grad_norm": 24.466993243731874, "learning_rate": 8.363072999228138e-08, "logits/chosen": -2.953125, "logits/rejected": -3.265625, "logps/chosen": -700.0, "logps/rejected": -1192.0, "loss": 0.2388, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.21875, "rewards/margins": 5.03125, "rewards/rejected": -10.25, "step": 19110 }, { "epoch": 0.75898616596868, "grad_norm": 20.373233550304874, "learning_rate": 8.337230591479044e-08, "logits/chosen": -2.71875, "logits/rejected": -2.890625, "logps/chosen": -660.0, "logps/rejected": -1176.0, "loss": 0.1873, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.9375, "rewards/margins": 5.1875, "rewards/rejected": -10.125, "step": 19120 }, { "epoch": 0.7593831252605046, "grad_norm": 27.775563182257258, "learning_rate": 8.311420180453304e-08, "logits/chosen": -2.890625, "logits/rejected": -3.0, "logps/chosen": -652.0, "logps/rejected": -1200.0, "loss": 0.2487, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.90625, "rewards/margins": 5.3125, "rewards/rejected": -10.25, "step": 19130 }, { "epoch": 0.7597800845523291, "grad_norm": 29.235761516469296, "learning_rate": 8.285641815713421e-08, "logits/chosen": -2.9375, "logits/rejected": -3.171875, "logps/chosen": -648.0, "logps/rejected": -1160.0, "loss": 0.1995, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.84375, "rewards/margins": 5.1875, "rewards/rejected": -10.0625, "step": 19140 }, { "epoch": 0.7601770438441537, "grad_norm": 25.521356206852396, "learning_rate": 8.259895546760346e-08, "logits/chosen": -2.703125, "logits/rejected": -2.796875, "logps/chosen": -732.0, "logps/rejected": -1208.0, "loss": 0.222, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.46875, "rewards/margins": 4.6875, "rewards/rejected": -10.125, "step": 19150 }, { "epoch": 0.7605740031359784, "grad_norm": 21.445086781222805, "learning_rate": 8.234181423033426e-08, "logits/chosen": -2.6875, "logits/rejected": -2.84375, "logps/chosen": -680.0, "logps/rejected": -1184.0, "loss": 0.1943, "rewards/accuracies": 0.9375, "rewards/chosen": -4.875, "rewards/margins": 5.09375, "rewards/rejected": -9.9375, "step": 19160 }, { "epoch": 0.760970962427803, "grad_norm": 27.816971799058088, "learning_rate": 8.208499493910278e-08, "logits/chosen": -2.703125, "logits/rejected": -2.78125, "logps/chosen": -656.0, "logps/rejected": -1184.0, "loss": 0.213, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.84375, "rewards/margins": 5.28125, "rewards/rejected": -10.125, "step": 19170 }, { "epoch": 0.7613679217196276, "grad_norm": 25.876511778145247, "learning_rate": 8.182849808706682e-08, "logits/chosen": -2.59375, "logits/rejected": -2.828125, "logps/chosen": -660.0, "logps/rejected": -1168.0, "loss": 0.224, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.6875, "rewards/margins": 5.3125, "rewards/rejected": -10.0, "step": 19180 }, { "epoch": 0.7617648810114522, "grad_norm": 31.61493380391997, "learning_rate": 8.157232416676505e-08, "logits/chosen": -2.6875, "logits/rejected": -2.796875, "logps/chosen": -720.0, "logps/rejected": -1192.0, "loss": 0.2242, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.21875, "rewards/margins": 4.8125, "rewards/rejected": -10.0625, "step": 19190 }, { "epoch": 0.7621618403032769, "grad_norm": 24.48792746283476, "learning_rate": 8.131647367011599e-08, "logits/chosen": -2.765625, "logits/rejected": -2.734375, "logps/chosen": -656.0, "logps/rejected": -1256.0, "loss": 0.1966, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.84375, "rewards/margins": 5.84375, "rewards/rejected": -10.6875, "step": 19200 }, { "epoch": 0.7625587995951015, "grad_norm": 44.438610748719746, "learning_rate": 8.106094708841726e-08, "logits/chosen": -2.609375, "logits/rejected": -2.921875, "logps/chosen": -664.0, "logps/rejected": -1128.0, "loss": 0.2355, "rewards/accuracies": 0.9375, "rewards/chosen": -4.9375, "rewards/margins": 4.78125, "rewards/rejected": -9.6875, "step": 19210 }, { "epoch": 0.7629557588869261, "grad_norm": 24.103147949309538, "learning_rate": 8.080574491234448e-08, "logits/chosen": -2.796875, "logits/rejected": -3.0625, "logps/chosen": -700.0, "logps/rejected": -1168.0, "loss": 0.2219, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.1875, "rewards/margins": 4.875, "rewards/rejected": -10.0625, "step": 19220 }, { "epoch": 0.7633527181787507, "grad_norm": 23.98824065726538, "learning_rate": 8.055086763195018e-08, "logits/chosen": -2.6875, "logits/rejected": -2.84375, "logps/chosen": -656.0, "logps/rejected": -1160.0, "loss": 0.2051, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.84375, "rewards/margins": 5.15625, "rewards/rejected": -10.0, "step": 19230 }, { "epoch": 0.7637496774705754, "grad_norm": 22.090154554117216, "learning_rate": 8.029631573666304e-08, "logits/chosen": -2.78125, "logits/rejected": -2.78125, "logps/chosen": -640.0, "logps/rejected": -1208.0, "loss": 0.2041, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.8125, "rewards/margins": 5.46875, "rewards/rejected": -10.25, "step": 19240 }, { "epoch": 0.7641466367624, "grad_norm": 26.05790125066934, "learning_rate": 8.004208971528693e-08, "logits/chosen": -2.75, "logits/rejected": -3.015625, "logps/chosen": -688.0, "logps/rejected": -1144.0, "loss": 0.21, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.0625, "rewards/margins": 4.65625, "rewards/rejected": -9.75, "step": 19250 }, { "epoch": 0.7645435960542246, "grad_norm": 21.452442915952066, "learning_rate": 7.978819005599998e-08, "logits/chosen": -2.765625, "logits/rejected": -2.875, "logps/chosen": -692.0, "logps/rejected": -1200.0, "loss": 0.1854, "rewards/accuracies": 0.96875, "rewards/chosen": -5.03125, "rewards/margins": 5.1875, "rewards/rejected": -10.25, "step": 19260 }, { "epoch": 0.7649405553460492, "grad_norm": 23.67330820238861, "learning_rate": 7.953461724635377e-08, "logits/chosen": -2.609375, "logits/rejected": -2.90625, "logps/chosen": -664.0, "logps/rejected": -1176.0, "loss": 0.2108, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.8125, "rewards/margins": 5.1875, "rewards/rejected": -10.0, "step": 19270 }, { "epoch": 0.7653375146378739, "grad_norm": 19.704706262376543, "learning_rate": 7.928137177327201e-08, "logits/chosen": -2.796875, "logits/rejected": -3.09375, "logps/chosen": -648.0, "logps/rejected": -1144.0, "loss": 0.2356, "rewards/accuracies": 0.96875, "rewards/chosen": -4.78125, "rewards/margins": 5.09375, "rewards/rejected": -9.875, "step": 19280 }, { "epoch": 0.7657344739296985, "grad_norm": 21.850968555246247, "learning_rate": 7.902845412304995e-08, "logits/chosen": -2.640625, "logits/rejected": -2.75, "logps/chosen": -656.0, "logps/rejected": -1224.0, "loss": 0.1991, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.71875, "rewards/margins": 5.65625, "rewards/rejected": -10.375, "step": 19290 }, { "epoch": 0.7661314332215231, "grad_norm": 28.868333425979486, "learning_rate": 7.87758647813532e-08, "logits/chosen": -2.859375, "logits/rejected": -3.0625, "logps/chosen": -688.0, "logps/rejected": -1208.0, "loss": 0.2093, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.1875, "rewards/margins": 5.09375, "rewards/rejected": -10.3125, "step": 19300 }, { "epoch": 0.7665283925133478, "grad_norm": 24.564839908332996, "learning_rate": 7.852360423321718e-08, "logits/chosen": -2.734375, "logits/rejected": -2.859375, "logps/chosen": -712.0, "logps/rejected": -1200.0, "loss": 0.1968, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.375, "rewards/margins": 4.875, "rewards/rejected": -10.25, "step": 19310 }, { "epoch": 0.7669253518051724, "grad_norm": 30.1099388118389, "learning_rate": 7.827167296304588e-08, "logits/chosen": -2.828125, "logits/rejected": -2.953125, "logps/chosen": -704.0, "logps/rejected": -1120.0, "loss": 0.3025, "rewards/accuracies": 0.90625, "rewards/chosen": -5.375, "rewards/margins": 4.28125, "rewards/rejected": -9.6875, "step": 19320 }, { "epoch": 0.767322311096997, "grad_norm": 24.33222211320605, "learning_rate": 7.802007145461081e-08, "logits/chosen": -2.703125, "logits/rejected": -3.0625, "logps/chosen": -660.0, "logps/rejected": -1184.0, "loss": 0.2188, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.71875, "rewards/margins": 5.40625, "rewards/rejected": -10.125, "step": 19330 }, { "epoch": 0.7677192703888216, "grad_norm": 20.696466381898468, "learning_rate": 7.776880019105036e-08, "logits/chosen": -2.796875, "logits/rejected": -2.9375, "logps/chosen": -632.0, "logps/rejected": -1152.0, "loss": 0.2386, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.6875, "rewards/margins": 5.15625, "rewards/rejected": -9.8125, "step": 19340 }, { "epoch": 0.7681162296806463, "grad_norm": 28.64080077618775, "learning_rate": 7.751785965486892e-08, "logits/chosen": -2.71875, "logits/rejected": -2.9375, "logps/chosen": -696.0, "logps/rejected": -1176.0, "loss": 0.2245, "rewards/accuracies": 0.96875, "rewards/chosen": -5.125, "rewards/margins": 4.96875, "rewards/rejected": -10.125, "step": 19350 }, { "epoch": 0.7685131889724709, "grad_norm": 23.43216044767199, "learning_rate": 7.72672503279355e-08, "logits/chosen": -2.828125, "logits/rejected": -2.953125, "logps/chosen": -672.0, "logps/rejected": -1152.0, "loss": 0.2591, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.9375, "rewards/margins": 4.875, "rewards/rejected": -9.8125, "step": 19360 }, { "epoch": 0.7689101482642955, "grad_norm": 26.44647638512168, "learning_rate": 7.701697269148344e-08, "logits/chosen": -2.65625, "logits/rejected": -2.640625, "logps/chosen": -652.0, "logps/rejected": -1192.0, "loss": 0.2198, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.71875, "rewards/margins": 5.46875, "rewards/rejected": -10.1875, "step": 19370 }, { "epoch": 0.7693071075561201, "grad_norm": 28.73097897944233, "learning_rate": 7.676702722610886e-08, "logits/chosen": -2.8125, "logits/rejected": -2.875, "logps/chosen": -644.0, "logps/rejected": -1168.0, "loss": 0.2319, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.8125, "rewards/margins": 5.125, "rewards/rejected": -9.9375, "step": 19380 }, { "epoch": 0.7697040668479448, "grad_norm": 25.166334082616167, "learning_rate": 7.651741441177011e-08, "logits/chosen": -2.8125, "logits/rejected": -3.078125, "logps/chosen": -696.0, "logps/rejected": -1136.0, "loss": 0.2013, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.28125, "rewards/margins": 4.46875, "rewards/rejected": -9.75, "step": 19390 }, { "epoch": 0.7701010261397694, "grad_norm": 35.620379305753424, "learning_rate": 7.626813472778695e-08, "logits/chosen": -2.765625, "logits/rejected": -2.890625, "logps/chosen": -684.0, "logps/rejected": -1232.0, "loss": 0.2434, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.0, "rewards/margins": 5.5, "rewards/rejected": -10.5, "step": 19400 }, { "epoch": 0.770497985431594, "grad_norm": 30.67910387972699, "learning_rate": 7.601918865283912e-08, "logits/chosen": -2.84375, "logits/rejected": -3.03125, "logps/chosen": -668.0, "logps/rejected": -1184.0, "loss": 0.2117, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.09375, "rewards/margins": 5.28125, "rewards/rejected": -10.375, "step": 19410 }, { "epoch": 0.7708949447234186, "grad_norm": 30.982647364585635, "learning_rate": 7.577057666496608e-08, "logits/chosen": -2.78125, "logits/rejected": -3.046875, "logps/chosen": -648.0, "logps/rejected": -1144.0, "loss": 0.2217, "rewards/accuracies": 0.96875, "rewards/chosen": -4.75, "rewards/margins": 5.03125, "rewards/rejected": -9.8125, "step": 19420 }, { "epoch": 0.7712919040152433, "grad_norm": 22.049442542148896, "learning_rate": 7.552229924156558e-08, "logits/chosen": -2.84375, "logits/rejected": -3.15625, "logps/chosen": -676.0, "logps/rejected": -1208.0, "loss": 0.1969, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -5.03125, "rewards/margins": 5.1875, "rewards/rejected": -10.1875, "step": 19430 }, { "epoch": 0.7716888633070679, "grad_norm": 22.238940673432012, "learning_rate": 7.52743568593928e-08, "logits/chosen": -2.6875, "logits/rejected": -2.75, "logps/chosen": -700.0, "logps/rejected": -1208.0, "loss": 0.2152, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.15625, "rewards/margins": 5.0, "rewards/rejected": -10.125, "step": 19440 }, { "epoch": 0.7720858225988925, "grad_norm": 33.999994266745574, "learning_rate": 7.502674999455989e-08, "logits/chosen": -2.59375, "logits/rejected": -2.984375, "logps/chosen": -656.0, "logps/rejected": -1136.0, "loss": 0.2269, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.84375, "rewards/margins": 4.84375, "rewards/rejected": -9.6875, "step": 19450 }, { "epoch": 0.7724827818907171, "grad_norm": 33.31362875588138, "learning_rate": 7.477947912253435e-08, "logits/chosen": -2.796875, "logits/rejected": -3.078125, "logps/chosen": -668.0, "logps/rejected": -1168.0, "loss": 0.2215, "rewards/accuracies": 0.96875, "rewards/chosen": -4.875, "rewards/margins": 5.25, "rewards/rejected": -10.125, "step": 19460 }, { "epoch": 0.7728797411825418, "grad_norm": 23.13730189705252, "learning_rate": 7.453254471813886e-08, "logits/chosen": -2.859375, "logits/rejected": -2.890625, "logps/chosen": -696.0, "logps/rejected": -1208.0, "loss": 0.2147, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.1875, "rewards/margins": 5.1875, "rewards/rejected": -10.375, "step": 19470 }, { "epoch": 0.7732767004743664, "grad_norm": 34.175816935510476, "learning_rate": 7.428594725554971e-08, "logits/chosen": -2.90625, "logits/rejected": -3.21875, "logps/chosen": -696.0, "logps/rejected": -1176.0, "loss": 0.2046, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.34375, "rewards/margins": 4.8125, "rewards/rejected": -10.125, "step": 19480 }, { "epoch": 0.773673659766191, "grad_norm": 30.750266408969857, "learning_rate": 7.40396872082962e-08, "logits/chosen": -2.796875, "logits/rejected": -3.015625, "logps/chosen": -684.0, "logps/rejected": -1176.0, "loss": 0.2034, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.1875, "rewards/margins": 4.96875, "rewards/rejected": -10.125, "step": 19490 }, { "epoch": 0.7740706190580156, "grad_norm": 25.229592876363196, "learning_rate": 7.379376504925994e-08, "logits/chosen": -2.84375, "logits/rejected": -3.203125, "logps/chosen": -660.0, "logps/rejected": -1176.0, "loss": 0.2264, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.78125, "rewards/margins": 5.15625, "rewards/rejected": -9.9375, "step": 19500 }, { "epoch": 0.7744675783498403, "grad_norm": 39.305088958494174, "learning_rate": 7.354818125067347e-08, "logits/chosen": -2.84375, "logits/rejected": -2.96875, "logps/chosen": -660.0, "logps/rejected": -1200.0, "loss": 0.2324, "rewards/accuracies": 0.9375, "rewards/chosen": -4.90625, "rewards/margins": 5.34375, "rewards/rejected": -10.25, "step": 19510 }, { "epoch": 0.7748645376416649, "grad_norm": 30.042993594851865, "learning_rate": 7.33029362841196e-08, "logits/chosen": -2.84375, "logits/rejected": -2.90625, "logps/chosen": -664.0, "logps/rejected": -1144.0, "loss": 0.2177, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.1875, "rewards/margins": 4.6875, "rewards/rejected": -9.875, "step": 19520 }, { "epoch": 0.7752614969334894, "grad_norm": 25.261783403549643, "learning_rate": 7.305803062053068e-08, "logits/chosen": -2.75, "logits/rejected": -2.9375, "logps/chosen": -656.0, "logps/rejected": -1152.0, "loss": 0.254, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.84375, "rewards/margins": 5.15625, "rewards/rejected": -10.0, "step": 19530 }, { "epoch": 0.775658456225314, "grad_norm": 24.01163442740993, "learning_rate": 7.281346473018743e-08, "logits/chosen": -2.734375, "logits/rejected": -2.890625, "logps/chosen": -672.0, "logps/rejected": -1176.0, "loss": 0.1964, "rewards/accuracies": 0.96875, "rewards/chosen": -4.96875, "rewards/margins": 5.0625, "rewards/rejected": -10.0625, "step": 19540 }, { "epoch": 0.7760554155171387, "grad_norm": 19.155237353898222, "learning_rate": 7.256923908271803e-08, "logits/chosen": -2.859375, "logits/rejected": -2.921875, "logps/chosen": -656.0, "logps/rejected": -1176.0, "loss": 0.2129, "rewards/accuracies": 0.9375, "rewards/chosen": -4.96875, "rewards/margins": 5.03125, "rewards/rejected": -10.0, "step": 19550 }, { "epoch": 0.7764523748089633, "grad_norm": 24.61982526447935, "learning_rate": 7.232535414709739e-08, "logits/chosen": -2.6875, "logits/rejected": -2.71875, "logps/chosen": -676.0, "logps/rejected": -1192.0, "loss": 0.2262, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.96875, "rewards/margins": 5.28125, "rewards/rejected": -10.25, "step": 19560 }, { "epoch": 0.7768493341007879, "grad_norm": 28.288055018158897, "learning_rate": 7.208181039164607e-08, "logits/chosen": -2.71875, "logits/rejected": -2.828125, "logps/chosen": -688.0, "logps/rejected": -1200.0, "loss": 0.2061, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.125, "rewards/margins": 5.03125, "rewards/rejected": -10.125, "step": 19570 }, { "epoch": 0.7772462933926126, "grad_norm": 21.63520111582934, "learning_rate": 7.183860828402962e-08, "logits/chosen": -2.75, "logits/rejected": -2.9375, "logps/chosen": -700.0, "logps/rejected": -1200.0, "loss": 0.1976, "rewards/accuracies": 0.96875, "rewards/chosen": -5.21875, "rewards/margins": 5.03125, "rewards/rejected": -10.25, "step": 19580 }, { "epoch": 0.7776432526844372, "grad_norm": 29.844897974866107, "learning_rate": 7.159574829125759e-08, "logits/chosen": -2.734375, "logits/rejected": -3.078125, "logps/chosen": -688.0, "logps/rejected": -1136.0, "loss": 0.1929, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.15625, "rewards/margins": 4.53125, "rewards/rejected": -9.6875, "step": 19590 }, { "epoch": 0.7780402119762618, "grad_norm": 28.67692942647004, "learning_rate": 7.135323087968231e-08, "logits/chosen": -2.90625, "logits/rejected": -3.078125, "logps/chosen": -672.0, "logps/rejected": -1208.0, "loss": 0.1932, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.03125, "rewards/margins": 5.25, "rewards/rejected": -10.3125, "step": 19600 }, { "epoch": 0.7784371712680864, "grad_norm": 38.01343397512374, "learning_rate": 7.111105651499852e-08, "logits/chosen": -2.734375, "logits/rejected": -2.96875, "logps/chosen": -664.0, "logps/rejected": -1136.0, "loss": 0.2358, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.875, "rewards/margins": 4.78125, "rewards/rejected": -9.6875, "step": 19610 }, { "epoch": 0.7788341305599111, "grad_norm": 27.563633027769214, "learning_rate": 7.086922566224198e-08, "logits/chosen": -2.65625, "logits/rejected": -2.875, "logps/chosen": -672.0, "logps/rejected": -1152.0, "loss": 0.2023, "rewards/accuracies": 0.96875, "rewards/chosen": -5.0625, "rewards/margins": 5.0, "rewards/rejected": -10.0625, "step": 19620 }, { "epoch": 0.7792310898517357, "grad_norm": 19.305703792069107, "learning_rate": 7.062773878578912e-08, "logits/chosen": -2.890625, "logits/rejected": -3.1875, "logps/chosen": -656.0, "logps/rejected": -1184.0, "loss": 0.2479, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.96875, "rewards/margins": 5.25, "rewards/rejected": -10.1875, "step": 19630 }, { "epoch": 0.7796280491435603, "grad_norm": 26.238132053009707, "learning_rate": 7.03865963493557e-08, "logits/chosen": -2.796875, "logits/rejected": -3.078125, "logps/chosen": -668.0, "logps/rejected": -1152.0, "loss": 0.219, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.90625, "rewards/margins": 4.9375, "rewards/rejected": -9.875, "step": 19640 }, { "epoch": 0.7800250084353849, "grad_norm": 39.687467781130735, "learning_rate": 7.014579881599605e-08, "logits/chosen": -2.703125, "logits/rejected": -2.84375, "logps/chosen": -716.0, "logps/rejected": -1200.0, "loss": 0.2355, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.375, "rewards/margins": 4.9375, "rewards/rejected": -10.3125, "step": 19650 }, { "epoch": 0.7804219677272096, "grad_norm": 27.414704280101983, "learning_rate": 6.990534664810222e-08, "logits/chosen": -2.796875, "logits/rejected": -2.84375, "logps/chosen": -664.0, "logps/rejected": -1184.0, "loss": 0.2308, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.84375, "rewards/margins": 5.25, "rewards/rejected": -10.0625, "step": 19660 }, { "epoch": 0.7808189270190342, "grad_norm": 32.06609568960261, "learning_rate": 6.966524030740287e-08, "logits/chosen": -2.625, "logits/rejected": -2.875, "logps/chosen": -732.0, "logps/rejected": -1200.0, "loss": 0.2226, "rewards/accuracies": 0.9375, "rewards/chosen": -5.4375, "rewards/margins": 4.65625, "rewards/rejected": -10.0625, "step": 19670 }, { "epoch": 0.7812158863108588, "grad_norm": 24.681375486891127, "learning_rate": 6.942548025496312e-08, "logits/chosen": -2.84375, "logits/rejected": -3.015625, "logps/chosen": -664.0, "logps/rejected": -1160.0, "loss": 0.237, "rewards/accuracies": 0.9375, "rewards/chosen": -5.0, "rewards/margins": 5.125, "rewards/rejected": -10.125, "step": 19680 }, { "epoch": 0.7816128456026834, "grad_norm": 18.250640292044366, "learning_rate": 6.918606695118262e-08, "logits/chosen": -2.65625, "logits/rejected": -2.859375, "logps/chosen": -696.0, "logps/rejected": -1160.0, "loss": 0.2529, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.0625, "rewards/margins": 4.75, "rewards/rejected": -9.8125, "step": 19690 }, { "epoch": 0.7820098048945081, "grad_norm": 22.7068053896686, "learning_rate": 6.894700085579536e-08, "logits/chosen": -2.734375, "logits/rejected": -2.921875, "logps/chosen": -700.0, "logps/rejected": -1184.0, "loss": 0.2231, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -5.15625, "rewards/margins": 5.0625, "rewards/rejected": -10.25, "step": 19700 }, { "epoch": 0.7824067641863327, "grad_norm": 29.832569801384814, "learning_rate": 6.87082824278685e-08, "logits/chosen": -2.828125, "logits/rejected": -3.078125, "logps/chosen": -668.0, "logps/rejected": -1200.0, "loss": 0.2009, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.0625, "rewards/margins": 5.0625, "rewards/rejected": -10.125, "step": 19710 }, { "epoch": 0.7828037234781573, "grad_norm": 42.01524500858085, "learning_rate": 6.846991212580177e-08, "logits/chosen": -2.84375, "logits/rejected": -3.109375, "logps/chosen": -644.0, "logps/rejected": -1120.0, "loss": 0.2595, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.90625, "rewards/margins": 4.8125, "rewards/rejected": -9.6875, "step": 19720 }, { "epoch": 0.7832006827699819, "grad_norm": 29.095198312894244, "learning_rate": 6.823189040732641e-08, "logits/chosen": -2.6875, "logits/rejected": -2.859375, "logps/chosen": -708.0, "logps/rejected": -1176.0, "loss": 0.2306, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.25, "rewards/margins": 4.8125, "rewards/rejected": -10.0625, "step": 19730 }, { "epoch": 0.7835976420618066, "grad_norm": 41.32520194450729, "learning_rate": 6.79942177295041e-08, "logits/chosen": -2.625, "logits/rejected": -2.796875, "logps/chosen": -672.0, "logps/rejected": -1176.0, "loss": 0.2215, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.71875, "rewards/margins": 5.3125, "rewards/rejected": -10.0, "step": 19740 }, { "epoch": 0.7839946013536312, "grad_norm": 20.63135351439687, "learning_rate": 6.775689454872638e-08, "logits/chosen": -2.75, "logits/rejected": -2.875, "logps/chosen": -700.0, "logps/rejected": -1200.0, "loss": 0.2008, "rewards/accuracies": 0.96875, "rewards/chosen": -5.03125, "rewards/margins": 5.1875, "rewards/rejected": -10.25, "step": 19750 }, { "epoch": 0.7843915606454558, "grad_norm": 20.32397765017929, "learning_rate": 6.75199213207136e-08, "logits/chosen": -2.859375, "logits/rejected": -3.25, "logps/chosen": -660.0, "logps/rejected": -1184.0, "loss": 0.2099, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.03125, "rewards/margins": 5.25, "rewards/rejected": -10.25, "step": 19760 }, { "epoch": 0.7847885199372804, "grad_norm": 17.55483709083684, "learning_rate": 6.728329850051423e-08, "logits/chosen": -2.828125, "logits/rejected": -2.9375, "logps/chosen": -672.0, "logps/rejected": -1192.0, "loss": 0.2091, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.03125, "rewards/margins": 5.25, "rewards/rejected": -10.25, "step": 19770 }, { "epoch": 0.7851854792291051, "grad_norm": 32.021992088788416, "learning_rate": 6.704702654250391e-08, "logits/chosen": -2.75, "logits/rejected": -3.09375, "logps/chosen": -664.0, "logps/rejected": -1216.0, "loss": 0.2306, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.96875, "rewards/margins": 5.59375, "rewards/rejected": -10.5625, "step": 19780 }, { "epoch": 0.7855824385209297, "grad_norm": 14.75354874782081, "learning_rate": 6.681110590038436e-08, "logits/chosen": -2.71875, "logits/rejected": -2.796875, "logps/chosen": -700.0, "logps/rejected": -1272.0, "loss": 0.2151, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.03125, "rewards/margins": 5.5625, "rewards/rejected": -10.625, "step": 19790 }, { "epoch": 0.7859793978127543, "grad_norm": 29.472167773359978, "learning_rate": 6.657553702718274e-08, "logits/chosen": -2.765625, "logits/rejected": -2.953125, "logps/chosen": -668.0, "logps/rejected": -1200.0, "loss": 0.2488, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.9375, "rewards/margins": 5.25, "rewards/rejected": -10.25, "step": 19800 }, { "epoch": 0.7863763571045789, "grad_norm": 22.840836000025107, "learning_rate": 6.634032037525073e-08, "logits/chosen": -2.703125, "logits/rejected": -2.828125, "logps/chosen": -692.0, "logps/rejected": -1176.0, "loss": 0.2307, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.15625, "rewards/margins": 4.84375, "rewards/rejected": -10.0, "step": 19810 }, { "epoch": 0.7867733163964036, "grad_norm": 33.104059888552484, "learning_rate": 6.610545639626378e-08, "logits/chosen": -2.875, "logits/rejected": -3.046875, "logps/chosen": -636.0, "logps/rejected": -1192.0, "loss": 0.2269, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.65625, "rewards/margins": 5.46875, "rewards/rejected": -10.125, "step": 19820 }, { "epoch": 0.7871702756882282, "grad_norm": 30.38463118039003, "learning_rate": 6.587094554121986e-08, "logits/chosen": -2.65625, "logits/rejected": -2.875, "logps/chosen": -660.0, "logps/rejected": -1184.0, "loss": 0.2334, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.875, "rewards/margins": 5.28125, "rewards/rejected": -10.125, "step": 19830 }, { "epoch": 0.7875672349800528, "grad_norm": 30.465943786044107, "learning_rate": 6.56367882604392e-08, "logits/chosen": -2.875, "logits/rejected": -3.078125, "logps/chosen": -632.0, "logps/rejected": -1168.0, "loss": 0.2027, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.71875, "rewards/margins": 5.34375, "rewards/rejected": -10.0625, "step": 19840 }, { "epoch": 0.7879641942718775, "grad_norm": 22.486081324721543, "learning_rate": 6.540298500356278e-08, "logits/chosen": -2.71875, "logits/rejected": -2.9375, "logps/chosen": -704.0, "logps/rejected": -1232.0, "loss": 0.212, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.25, "rewards/margins": 5.34375, "rewards/rejected": -10.625, "step": 19850 }, { "epoch": 0.7883611535637021, "grad_norm": 33.48940534784092, "learning_rate": 6.516953621955179e-08, "logits/chosen": -2.625, "logits/rejected": -2.71875, "logps/chosen": -716.0, "logps/rejected": -1224.0, "loss": 0.2194, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.1875, "rewards/margins": 5.375, "rewards/rejected": -10.5625, "step": 19860 }, { "epoch": 0.7887581128555267, "grad_norm": 25.45049682684766, "learning_rate": 6.4936442356687e-08, "logits/chosen": -2.515625, "logits/rejected": -2.78125, "logps/chosen": -660.0, "logps/rejected": -1160.0, "loss": 0.2337, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.8125, "rewards/margins": 5.0625, "rewards/rejected": -9.875, "step": 19870 }, { "epoch": 0.7891550721473513, "grad_norm": 18.63838934505744, "learning_rate": 6.47037038625673e-08, "logits/chosen": -2.640625, "logits/rejected": -3.015625, "logps/chosen": -640.0, "logps/rejected": -1168.0, "loss": 0.1857, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.71875, "rewards/margins": 5.28125, "rewards/rejected": -10.0, "step": 19880 }, { "epoch": 0.789552031439176, "grad_norm": 25.829631022548558, "learning_rate": 6.44713211841095e-08, "logits/chosen": -2.6875, "logits/rejected": -2.859375, "logps/chosen": -676.0, "logps/rejected": -1152.0, "loss": 0.2258, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.0625, "rewards/margins": 4.71875, "rewards/rejected": -9.75, "step": 19890 }, { "epoch": 0.7899489907310006, "grad_norm": 24.259046909647857, "learning_rate": 6.423929476754686e-08, "logits/chosen": -2.6875, "logits/rejected": -2.921875, "logps/chosen": -652.0, "logps/rejected": -1160.0, "loss": 0.2538, "rewards/accuracies": 0.9375, "rewards/chosen": -4.8125, "rewards/margins": 5.03125, "rewards/rejected": -9.875, "step": 19900 }, { "epoch": 0.7903459500228251, "grad_norm": 29.16793926690572, "learning_rate": 6.400762505842877e-08, "logits/chosen": -2.71875, "logits/rejected": -3.0, "logps/chosen": -668.0, "logps/rejected": -1120.0, "loss": 0.1773, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.75, "rewards/margins": 4.71875, "rewards/rejected": -9.5, "step": 19910 }, { "epoch": 0.7907429093146497, "grad_norm": 24.69413226655701, "learning_rate": 6.377631250161955e-08, "logits/chosen": -2.78125, "logits/rejected": -3.046875, "logps/chosen": -624.0, "logps/rejected": -1168.0, "loss": 0.2061, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.625, "rewards/margins": 5.4375, "rewards/rejected": -10.0625, "step": 19920 }, { "epoch": 0.7911398686064745, "grad_norm": 29.894468938339013, "learning_rate": 6.354535754129759e-08, "logits/chosen": -2.59375, "logits/rejected": -2.796875, "logps/chosen": -656.0, "logps/rejected": -1176.0, "loss": 0.2121, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.8125, "rewards/margins": 5.21875, "rewards/rejected": -10.0625, "step": 19930 }, { "epoch": 0.791536827898299, "grad_norm": 24.17323289842383, "learning_rate": 6.331476062095484e-08, "logits/chosen": -2.78125, "logits/rejected": -2.96875, "logps/chosen": -652.0, "logps/rejected": -1176.0, "loss": 0.2101, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.8125, "rewards/margins": 5.09375, "rewards/rejected": -9.9375, "step": 19940 }, { "epoch": 0.7919337871901236, "grad_norm": 23.026786695806738, "learning_rate": 6.308452218339547e-08, "logits/chosen": -2.703125, "logits/rejected": -2.65625, "logps/chosen": -676.0, "logps/rejected": -1216.0, "loss": 0.2198, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.03125, "rewards/margins": 5.40625, "rewards/rejected": -10.4375, "step": 19950 }, { "epoch": 0.7923307464819482, "grad_norm": 18.909738369580513, "learning_rate": 6.285464267073553e-08, "logits/chosen": -2.484375, "logits/rejected": -2.65625, "logps/chosen": -652.0, "logps/rejected": -1200.0, "loss": 0.2122, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.75, "rewards/margins": 5.59375, "rewards/rejected": -10.3125, "step": 19960 }, { "epoch": 0.7927277057737729, "grad_norm": 21.012721508288408, "learning_rate": 6.26251225244017e-08, "logits/chosen": -2.828125, "logits/rejected": -3.0625, "logps/chosen": -664.0, "logps/rejected": -1168.0, "loss": 0.2033, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.90625, "rewards/margins": 5.125, "rewards/rejected": -10.0, "step": 19970 }, { "epoch": 0.7931246650655975, "grad_norm": 38.95262408425272, "learning_rate": 6.239596218513041e-08, "logits/chosen": -2.75, "logits/rejected": -2.84375, "logps/chosen": -680.0, "logps/rejected": -1184.0, "loss": 0.2296, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.125, "rewards/margins": 4.96875, "rewards/rejected": -10.125, "step": 19980 }, { "epoch": 0.7935216243574221, "grad_norm": 34.817064977662284, "learning_rate": 6.21671620929676e-08, "logits/chosen": -2.546875, "logits/rejected": -3.0, "logps/chosen": -656.0, "logps/rejected": -1168.0, "loss": 0.1969, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.71875, "rewards/margins": 5.375, "rewards/rejected": -10.125, "step": 19990 }, { "epoch": 0.7939185836492467, "grad_norm": 30.14393581288189, "learning_rate": 6.193872268726702e-08, "logits/chosen": -2.703125, "logits/rejected": -3.0, "logps/chosen": -692.0, "logps/rejected": -1208.0, "loss": 0.213, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.09375, "rewards/margins": 5.3125, "rewards/rejected": -10.375, "step": 20000 }, { "epoch": 0.7939185836492467, "eval_logits/chosen": -2.71875, "eval_logits/rejected": -2.9375, "eval_logps/chosen": -712.0, "eval_logps/rejected": -1120.0, "eval_loss": 0.25033268332481384, "eval_rewards/accuracies": 0.8946105241775513, "eval_rewards/chosen": -5.28125, "eval_rewards/margins": 4.21875, "eval_rewards/rejected": -9.5, "eval_runtime": 5412.4994, "eval_samples_per_second": 32.635, "eval_steps_per_second": 0.51, "step": 20000 }, { "epoch": 0.7943155429410714, "grad_norm": 25.366710024014665, "learning_rate": 6.171064440669013e-08, "logits/chosen": -2.609375, "logits/rejected": -2.8125, "logps/chosen": -664.0, "logps/rejected": -1216.0, "loss": 0.2365, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.90625, "rewards/margins": 5.53125, "rewards/rejected": -10.4375, "step": 20010 }, { "epoch": 0.794712502232896, "grad_norm": 34.556973232520065, "learning_rate": 6.148292768920474e-08, "logits/chosen": -2.625, "logits/rejected": -2.875, "logps/chosen": -668.0, "logps/rejected": -1144.0, "loss": 0.2285, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.0, "rewards/margins": 4.8125, "rewards/rejected": -9.8125, "step": 20020 }, { "epoch": 0.7951094615247206, "grad_norm": 23.594203633411603, "learning_rate": 6.125557297208436e-08, "logits/chosen": -2.5625, "logits/rejected": -2.859375, "logps/chosen": -676.0, "logps/rejected": -1192.0, "loss": 0.2339, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.875, "rewards/margins": 5.1875, "rewards/rejected": -10.0625, "step": 20030 }, { "epoch": 0.7955064208165452, "grad_norm": 33.681256024295976, "learning_rate": 6.102858069190753e-08, "logits/chosen": -2.609375, "logits/rejected": -3.015625, "logps/chosen": -664.0, "logps/rejected": -1168.0, "loss": 0.2133, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.96875, "rewards/margins": 5.15625, "rewards/rejected": -10.125, "step": 20040 }, { "epoch": 0.7959033801083699, "grad_norm": 19.061024527734624, "learning_rate": 6.080195128455673e-08, "logits/chosen": -2.6875, "logits/rejected": -3.046875, "logps/chosen": -680.0, "logps/rejected": -1160.0, "loss": 0.2094, "rewards/accuracies": 0.96875, "rewards/chosen": -5.03125, "rewards/margins": 5.0625, "rewards/rejected": -10.0625, "step": 20050 }, { "epoch": 0.7963003394001945, "grad_norm": 31.67297587575877, "learning_rate": 6.057568518521763e-08, "logits/chosen": -2.703125, "logits/rejected": -3.015625, "logps/chosen": -676.0, "logps/rejected": -1144.0, "loss": 0.2342, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.90625, "rewards/margins": 4.84375, "rewards/rejected": -9.75, "step": 20060 }, { "epoch": 0.7966972986920191, "grad_norm": 30.692386787829957, "learning_rate": 6.034978282837821e-08, "logits/chosen": -2.78125, "logits/rejected": -3.0, "logps/chosen": -664.0, "logps/rejected": -1192.0, "loss": 0.1922, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.6875, "rewards/margins": 5.5, "rewards/rejected": -10.1875, "step": 20070 }, { "epoch": 0.7970942579838437, "grad_norm": 25.977347437853208, "learning_rate": 6.012424464782798e-08, "logits/chosen": -2.671875, "logits/rejected": -2.984375, "logps/chosen": -684.0, "logps/rejected": -1216.0, "loss": 0.216, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.03125, "rewards/margins": 5.375, "rewards/rejected": -10.4375, "step": 20080 }, { "epoch": 0.7974912172756684, "grad_norm": 26.482250273521, "learning_rate": 5.98990710766572e-08, "logits/chosen": -2.703125, "logits/rejected": -2.9375, "logps/chosen": -644.0, "logps/rejected": -1160.0, "loss": 0.1861, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.6875, "rewards/margins": 5.28125, "rewards/rejected": -10.0, "step": 20090 }, { "epoch": 0.797888176567493, "grad_norm": 33.04154482131778, "learning_rate": 5.967426254725611e-08, "logits/chosen": -2.703125, "logits/rejected": -2.90625, "logps/chosen": -648.0, "logps/rejected": -1152.0, "loss": 0.1997, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.75, "rewards/margins": 5.0, "rewards/rejected": -9.8125, "step": 20100 }, { "epoch": 0.7982851358593176, "grad_norm": 20.134968116343153, "learning_rate": 5.944981949131367e-08, "logits/chosen": -2.71875, "logits/rejected": -2.9375, "logps/chosen": -644.0, "logps/rejected": -1136.0, "loss": 0.2174, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.71875, "rewards/margins": 4.96875, "rewards/rejected": -9.6875, "step": 20110 }, { "epoch": 0.7986820951511423, "grad_norm": 40.03110530493364, "learning_rate": 5.922574233981728e-08, "logits/chosen": -2.78125, "logits/rejected": -3.03125, "logps/chosen": -660.0, "logps/rejected": -1168.0, "loss": 0.2108, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.8125, "rewards/margins": 5.1875, "rewards/rejected": -10.0, "step": 20120 }, { "epoch": 0.7990790544429669, "grad_norm": 16.923830002676066, "learning_rate": 5.9002031523051555e-08, "logits/chosen": -2.625, "logits/rejected": -2.75, "logps/chosen": -652.0, "logps/rejected": -1136.0, "loss": 0.1793, "rewards/accuracies": 0.96875, "rewards/chosen": -4.8125, "rewards/margins": 4.90625, "rewards/rejected": -9.75, "step": 20130 }, { "epoch": 0.7994760137347915, "grad_norm": 27.194749058791704, "learning_rate": 5.877868747059775e-08, "logits/chosen": -2.78125, "logits/rejected": -3.09375, "logps/chosen": -664.0, "logps/rejected": -1160.0, "loss": 0.2152, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.9375, "rewards/margins": 5.0625, "rewards/rejected": -10.0, "step": 20140 }, { "epoch": 0.7998729730266161, "grad_norm": 23.55082798041504, "learning_rate": 5.855571061133294e-08, "logits/chosen": -2.765625, "logits/rejected": -3.125, "logps/chosen": -676.0, "logps/rejected": -1184.0, "loss": 0.1995, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.875, "rewards/margins": 5.40625, "rewards/rejected": -10.3125, "step": 20150 }, { "epoch": 0.8002699323184408, "grad_norm": 36.051144988193954, "learning_rate": 5.833310137342892e-08, "logits/chosen": -2.65625, "logits/rejected": -2.984375, "logps/chosen": -648.0, "logps/rejected": -1144.0, "loss": 0.2361, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.8125, "rewards/margins": 5.0625, "rewards/rejected": -9.875, "step": 20160 }, { "epoch": 0.8006668916102654, "grad_norm": 32.5471340443208, "learning_rate": 5.811086018435157e-08, "logits/chosen": -2.71875, "logits/rejected": -2.890625, "logps/chosen": -680.0, "logps/rejected": -1144.0, "loss": 0.2413, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.0625, "rewards/margins": 4.625, "rewards/rejected": -9.6875, "step": 20170 }, { "epoch": 0.80106385090209, "grad_norm": 24.469267836209404, "learning_rate": 5.788898747086002e-08, "logits/chosen": -2.75, "logits/rejected": -2.609375, "logps/chosen": -664.0, "logps/rejected": -1152.0, "loss": 0.244, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.03125, "rewards/margins": 4.875, "rewards/rejected": -9.9375, "step": 20180 }, { "epoch": 0.8014608101939146, "grad_norm": 16.473747381906488, "learning_rate": 5.766748365900598e-08, "logits/chosen": -2.8125, "logits/rejected": -3.046875, "logps/chosen": -660.0, "logps/rejected": -1160.0, "loss": 0.1911, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.96875, "rewards/margins": 4.96875, "rewards/rejected": -9.9375, "step": 20190 }, { "epoch": 0.8018577694857393, "grad_norm": 35.381094034879624, "learning_rate": 5.744634917413274e-08, "logits/chosen": -2.703125, "logits/rejected": -2.984375, "logps/chosen": -648.0, "logps/rejected": -1200.0, "loss": 0.2479, "rewards/accuracies": 0.9375, "rewards/chosen": -4.875, "rewards/margins": 5.4375, "rewards/rejected": -10.3125, "step": 20200 }, { "epoch": 0.8022547287775639, "grad_norm": 29.104131669699605, "learning_rate": 5.722558444087425e-08, "logits/chosen": -2.78125, "logits/rejected": -2.875, "logps/chosen": -688.0, "logps/rejected": -1184.0, "loss": 0.2453, "rewards/accuracies": 0.96875, "rewards/chosen": -5.1875, "rewards/margins": 4.84375, "rewards/rejected": -10.0, "step": 20210 }, { "epoch": 0.8026516880693885, "grad_norm": 30.23052062066906, "learning_rate": 5.700518988315447e-08, "logits/chosen": -2.75, "logits/rejected": -2.8125, "logps/chosen": -696.0, "logps/rejected": -1184.0, "loss": 0.2194, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.21875, "rewards/margins": 4.78125, "rewards/rejected": -10.0, "step": 20220 }, { "epoch": 0.8030486473612131, "grad_norm": 22.1385297444737, "learning_rate": 5.6785165924186695e-08, "logits/chosen": -2.703125, "logits/rejected": -3.015625, "logps/chosen": -672.0, "logps/rejected": -1184.0, "loss": 0.195, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.9375, "rewards/margins": 5.34375, "rewards/rejected": -10.25, "step": 20230 }, { "epoch": 0.8034456066530378, "grad_norm": 30.974154835848925, "learning_rate": 5.656551298647236e-08, "logits/chosen": -2.703125, "logits/rejected": -3.046875, "logps/chosen": -628.0, "logps/rejected": -1144.0, "loss": 0.2019, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.71875, "rewards/margins": 5.125, "rewards/rejected": -9.8125, "step": 20240 }, { "epoch": 0.8038425659448624, "grad_norm": 20.406482409651435, "learning_rate": 5.6346231491800675e-08, "logits/chosen": -2.578125, "logits/rejected": -3.0, "logps/chosen": -640.0, "logps/rejected": -1192.0, "loss": 0.2077, "rewards/accuracies": 0.96875, "rewards/chosen": -4.71875, "rewards/margins": 5.40625, "rewards/rejected": -10.125, "step": 20250 }, { "epoch": 0.804239525236687, "grad_norm": 26.982796085322935, "learning_rate": 5.612732186124738e-08, "logits/chosen": -2.5625, "logits/rejected": -2.71875, "logps/chosen": -656.0, "logps/rejected": -1184.0, "loss": 0.2219, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.6875, "rewards/margins": 5.21875, "rewards/rejected": -9.9375, "step": 20260 }, { "epoch": 0.8046364845285116, "grad_norm": 22.84645825423406, "learning_rate": 5.5908784515174165e-08, "logits/chosen": -2.6875, "logits/rejected": -2.96875, "logps/chosen": -692.0, "logps/rejected": -1160.0, "loss": 0.2335, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.96875, "rewards/margins": 4.9375, "rewards/rejected": -9.875, "step": 20270 }, { "epoch": 0.8050334438203363, "grad_norm": 29.999447984546034, "learning_rate": 5.569061987322807e-08, "logits/chosen": -2.75, "logits/rejected": -3.0625, "logps/chosen": -672.0, "logps/rejected": -1232.0, "loss": 0.2053, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.9375, "rewards/margins": 5.71875, "rewards/rejected": -10.6875, "step": 20280 }, { "epoch": 0.8054304031121609, "grad_norm": 27.50287800295026, "learning_rate": 5.547282835434008e-08, "logits/chosen": -2.578125, "logits/rejected": -2.796875, "logps/chosen": -684.0, "logps/rejected": -1200.0, "loss": 0.222, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.9375, "rewards/margins": 5.3125, "rewards/rejected": -10.25, "step": 20290 }, { "epoch": 0.8058273624039854, "grad_norm": 26.32568348218991, "learning_rate": 5.525541037672504e-08, "logits/chosen": -2.8125, "logits/rejected": -2.796875, "logps/chosen": -692.0, "logps/rejected": -1160.0, "loss": 0.2176, "rewards/accuracies": 0.9375, "rewards/chosen": -5.375, "rewards/margins": 4.5, "rewards/rejected": -9.875, "step": 20300 }, { "epoch": 0.80622432169581, "grad_norm": 27.1912460582102, "learning_rate": 5.503836635788031e-08, "logits/chosen": -2.703125, "logits/rejected": -2.859375, "logps/chosen": -664.0, "logps/rejected": -1168.0, "loss": 0.1995, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.03125, "rewards/margins": 5.0625, "rewards/rejected": -10.0625, "step": 20310 }, { "epoch": 0.8066212809876347, "grad_norm": 35.84935857707108, "learning_rate": 5.4821696714585114e-08, "logits/chosen": -2.6875, "logits/rejected": -2.796875, "logps/chosen": -676.0, "logps/rejected": -1248.0, "loss": 0.2101, "rewards/accuracies": 0.96875, "rewards/chosen": -4.9375, "rewards/margins": 5.78125, "rewards/rejected": -10.75, "step": 20320 }, { "epoch": 0.8070182402794593, "grad_norm": 23.008705401131174, "learning_rate": 5.4605401862899996e-08, "logits/chosen": -2.859375, "logits/rejected": -3.03125, "logps/chosen": -672.0, "logps/rejected": -1168.0, "loss": 0.2227, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.03125, "rewards/margins": 5.0, "rewards/rejected": -10.0, "step": 20330 }, { "epoch": 0.8074151995712839, "grad_norm": 23.825436647042707, "learning_rate": 5.4389482218165584e-08, "logits/chosen": -2.6875, "logits/rejected": -2.734375, "logps/chosen": -644.0, "logps/rejected": -1176.0, "loss": 0.221, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.90625, "rewards/margins": 5.1875, "rewards/rejected": -10.0625, "step": 20340 }, { "epoch": 0.8078121588631086, "grad_norm": 28.6426452272508, "learning_rate": 5.4173938195002045e-08, "logits/chosen": -2.75, "logits/rejected": -2.8125, "logps/chosen": -704.0, "logps/rejected": -1168.0, "loss": 0.2408, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.3125, "rewards/margins": 4.5, "rewards/rejected": -9.8125, "step": 20350 }, { "epoch": 0.8082091181549332, "grad_norm": 26.942341893413506, "learning_rate": 5.3958770207308435e-08, "logits/chosen": -2.75, "logits/rejected": -3.078125, "logps/chosen": -676.0, "logps/rejected": -1216.0, "loss": 0.2082, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.03125, "rewards/margins": 5.4375, "rewards/rejected": -10.5, "step": 20360 }, { "epoch": 0.8086060774467578, "grad_norm": 28.67374091944667, "learning_rate": 5.3743978668261496e-08, "logits/chosen": -2.703125, "logits/rejected": -2.8125, "logps/chosen": -656.0, "logps/rejected": -1208.0, "loss": 0.2393, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.84375, "rewards/margins": 5.53125, "rewards/rejected": -10.375, "step": 20370 }, { "epoch": 0.8090030367385824, "grad_norm": 29.193423942369066, "learning_rate": 5.35295639903153e-08, "logits/chosen": -2.59375, "logits/rejected": -3.03125, "logps/chosen": -696.0, "logps/rejected": -1240.0, "loss": 0.2153, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.09375, "rewards/margins": 5.59375, "rewards/rejected": -10.6875, "step": 20380 }, { "epoch": 0.8093999960304071, "grad_norm": 53.6189034448816, "learning_rate": 5.331552658520011e-08, "logits/chosen": -2.703125, "logits/rejected": -2.953125, "logps/chosen": -676.0, "logps/rejected": -1168.0, "loss": 0.2356, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.9375, "rewards/margins": 5.03125, "rewards/rejected": -9.9375, "step": 20390 }, { "epoch": 0.8097969553222317, "grad_norm": 21.687073148426215, "learning_rate": 5.3101866863921704e-08, "logits/chosen": -2.703125, "logits/rejected": -2.859375, "logps/chosen": -684.0, "logps/rejected": -1184.0, "loss": 0.2123, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.0625, "rewards/margins": 5.09375, "rewards/rejected": -10.1875, "step": 20400 }, { "epoch": 0.8101939146140563, "grad_norm": 30.262312571159832, "learning_rate": 5.288858523676074e-08, "logits/chosen": -2.78125, "logits/rejected": -3.0, "logps/chosen": -692.0, "logps/rejected": -1152.0, "loss": 0.2427, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.25, "rewards/margins": 4.75, "rewards/rejected": -10.0, "step": 20410 }, { "epoch": 0.8105908739058809, "grad_norm": 17.325495044618926, "learning_rate": 5.267568211327186e-08, "logits/chosen": -2.6875, "logits/rejected": -3.125, "logps/chosen": -676.0, "logps/rejected": -1184.0, "loss": 0.2031, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.9375, "rewards/margins": 5.15625, "rewards/rejected": -10.125, "step": 20420 }, { "epoch": 0.8109878331977056, "grad_norm": 27.54746826781512, "learning_rate": 5.24631579022827e-08, "logits/chosen": -2.65625, "logits/rejected": -2.859375, "logps/chosen": -688.0, "logps/rejected": -1216.0, "loss": 0.1971, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.0625, "rewards/margins": 5.09375, "rewards/rejected": -10.125, "step": 20430 }, { "epoch": 0.8113847924895302, "grad_norm": 24.365498325939885, "learning_rate": 5.2251013011893444e-08, "logits/chosen": -2.625, "logits/rejected": -2.71875, "logps/chosen": -680.0, "logps/rejected": -1176.0, "loss": 0.1948, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.03125, "rewards/margins": 5.0, "rewards/rejected": -10.0, "step": 20440 }, { "epoch": 0.8117817517813548, "grad_norm": 27.92855188553194, "learning_rate": 5.203924784947572e-08, "logits/chosen": -2.625, "logits/rejected": -3.078125, "logps/chosen": -656.0, "logps/rejected": -1192.0, "loss": 0.1831, "rewards/accuracies": 0.96875, "rewards/chosen": -4.8125, "rewards/margins": 5.53125, "rewards/rejected": -10.375, "step": 20450 }, { "epoch": 0.8121787110731794, "grad_norm": 17.27846510378424, "learning_rate": 5.1827862821672174e-08, "logits/chosen": -2.796875, "logits/rejected": -2.96875, "logps/chosen": -676.0, "logps/rejected": -1176.0, "loss": 0.1984, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -5.0, "rewards/margins": 5.09375, "rewards/rejected": -10.125, "step": 20460 }, { "epoch": 0.8125756703650041, "grad_norm": 29.134737942021324, "learning_rate": 5.16168583343955e-08, "logits/chosen": -2.78125, "logits/rejected": -2.96875, "logps/chosen": -684.0, "logps/rejected": -1208.0, "loss": 0.2072, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.0625, "rewards/margins": 5.34375, "rewards/rejected": -10.375, "step": 20470 }, { "epoch": 0.8129726296568287, "grad_norm": 33.52502209387549, "learning_rate": 5.140623479282749e-08, "logits/chosen": -2.640625, "logits/rejected": -2.78125, "logps/chosen": -696.0, "logps/rejected": -1208.0, "loss": 0.2225, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.96875, "rewards/margins": 5.375, "rewards/rejected": -10.375, "step": 20480 }, { "epoch": 0.8133695889486533, "grad_norm": 29.39468778333962, "learning_rate": 5.119599260141852e-08, "logits/chosen": -2.734375, "logits/rejected": -3.265625, "logps/chosen": -680.0, "logps/rejected": -1184.0, "loss": 0.2127, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.0625, "rewards/margins": 5.15625, "rewards/rejected": -10.25, "step": 20490 }, { "epoch": 0.8137665482404779, "grad_norm": 21.733530825129012, "learning_rate": 5.098613216388659e-08, "logits/chosen": -2.578125, "logits/rejected": -2.703125, "logps/chosen": -696.0, "logps/rejected": -1232.0, "loss": 0.1963, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.96875, "rewards/margins": 5.3125, "rewards/rejected": -10.25, "step": 20500 }, { "epoch": 0.8141635075323026, "grad_norm": 30.498578483350663, "learning_rate": 5.077665388321681e-08, "logits/chosen": -2.609375, "logits/rejected": -3.0, "logps/chosen": -692.0, "logps/rejected": -1200.0, "loss": 0.2181, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.96875, "rewards/margins": 5.15625, "rewards/rejected": -10.125, "step": 20510 }, { "epoch": 0.8145604668241272, "grad_norm": 24.442838710404022, "learning_rate": 5.056755816166039e-08, "logits/chosen": -2.6875, "logits/rejected": -2.8125, "logps/chosen": -680.0, "logps/rejected": -1216.0, "loss": 0.1977, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -5.09375, "rewards/margins": 5.34375, "rewards/rejected": -10.4375, "step": 20520 }, { "epoch": 0.8149574261159518, "grad_norm": 40.067956932629585, "learning_rate": 5.0358845400733817e-08, "logits/chosen": -2.625, "logits/rejected": -2.796875, "logps/chosen": -680.0, "logps/rejected": -1192.0, "loss": 0.1978, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.0625, "rewards/margins": 5.25, "rewards/rejected": -10.3125, "step": 20530 }, { "epoch": 0.8153543854077764, "grad_norm": 31.96864630477011, "learning_rate": 5.015051600121833e-08, "logits/chosen": -2.671875, "logits/rejected": -3.015625, "logps/chosen": -652.0, "logps/rejected": -1160.0, "loss": 0.2522, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.84375, "rewards/margins": 5.0625, "rewards/rejected": -9.9375, "step": 20540 }, { "epoch": 0.8157513446996011, "grad_norm": 39.910892178334656, "learning_rate": 4.9942570363158826e-08, "logits/chosen": -2.84375, "logits/rejected": -3.0625, "logps/chosen": -712.0, "logps/rejected": -1176.0, "loss": 0.2074, "rewards/accuracies": 0.9375, "rewards/chosen": -5.28125, "rewards/margins": 4.71875, "rewards/rejected": -10.0, "step": 20550 }, { "epoch": 0.8161483039914257, "grad_norm": 24.466481502095526, "learning_rate": 4.973500888586363e-08, "logits/chosen": -2.8125, "logits/rejected": -3.109375, "logps/chosen": -692.0, "logps/rejected": -1176.0, "loss": 0.2255, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.15625, "rewards/margins": 5.0625, "rewards/rejected": -10.1875, "step": 20560 }, { "epoch": 0.8165452632832503, "grad_norm": 35.921139273148704, "learning_rate": 4.952783196790308e-08, "logits/chosen": -2.84375, "logits/rejected": -3.09375, "logps/chosen": -696.0, "logps/rejected": -1184.0, "loss": 0.1949, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.25, "rewards/margins": 4.96875, "rewards/rejected": -10.25, "step": 20570 }, { "epoch": 0.8169422225750749, "grad_norm": 29.302993626025803, "learning_rate": 4.932104000710913e-08, "logits/chosen": -2.78125, "logits/rejected": -2.9375, "logps/chosen": -672.0, "logps/rejected": -1176.0, "loss": 0.1867, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.0, "rewards/margins": 5.0625, "rewards/rejected": -10.0625, "step": 20580 }, { "epoch": 0.8173391818668996, "grad_norm": 20.79046358630166, "learning_rate": 4.9114633400574506e-08, "logits/chosen": -2.640625, "logits/rejected": -2.703125, "logps/chosen": -684.0, "logps/rejected": -1200.0, "loss": 0.2391, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.03125, "rewards/margins": 5.0, "rewards/rejected": -10.0, "step": 20590 }, { "epoch": 0.8177361411587242, "grad_norm": 35.961565577880634, "learning_rate": 4.890861254465206e-08, "logits/chosen": -2.828125, "logits/rejected": -3.0, "logps/chosen": -680.0, "logps/rejected": -1256.0, "loss": 0.2188, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.0625, "rewards/margins": 5.71875, "rewards/rejected": -10.75, "step": 20600 }, { "epoch": 0.8181331004505488, "grad_norm": 25.23553983554409, "learning_rate": 4.870297783495389e-08, "logits/chosen": -2.765625, "logits/rejected": -3.0, "logps/chosen": -668.0, "logps/rejected": -1200.0, "loss": 0.2007, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.96875, "rewards/margins": 5.375, "rewards/rejected": -10.3125, "step": 20610 }, { "epoch": 0.8185300597423735, "grad_norm": 27.119381083322857, "learning_rate": 4.849772966635046e-08, "logits/chosen": -2.921875, "logits/rejected": -3.28125, "logps/chosen": -652.0, "logps/rejected": -1136.0, "loss": 0.2348, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.0, "rewards/margins": 4.875, "rewards/rejected": -9.875, "step": 20620 }, { "epoch": 0.8189270190341981, "grad_norm": 29.86802040315286, "learning_rate": 4.829286843297012e-08, "logits/chosen": -2.6875, "logits/rejected": -3.109375, "logps/chosen": -640.0, "logps/rejected": -1168.0, "loss": 0.2074, "rewards/accuracies": 0.96875, "rewards/chosen": -4.625, "rewards/margins": 5.53125, "rewards/rejected": -10.125, "step": 20630 }, { "epoch": 0.8193239783260227, "grad_norm": 14.734659215364376, "learning_rate": 4.8088394528198046e-08, "logits/chosen": -2.65625, "logits/rejected": -2.828125, "logps/chosen": -712.0, "logps/rejected": -1216.0, "loss": 0.2139, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.1875, "rewards/margins": 5.3125, "rewards/rejected": -10.5, "step": 20640 }, { "epoch": 0.8197209376178473, "grad_norm": 29.73725853844836, "learning_rate": 4.7884308344675884e-08, "logits/chosen": -2.671875, "logits/rejected": -2.953125, "logps/chosen": -680.0, "logps/rejected": -1176.0, "loss": 0.2377, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.90625, "rewards/margins": 5.15625, "rewards/rejected": -10.0625, "step": 20650 }, { "epoch": 0.820117896909672, "grad_norm": 21.213862698449542, "learning_rate": 4.768061027430048e-08, "logits/chosen": -2.703125, "logits/rejected": -2.828125, "logps/chosen": -656.0, "logps/rejected": -1168.0, "loss": 0.1947, "rewards/accuracies": 0.96875, "rewards/chosen": -4.96875, "rewards/margins": 5.25, "rewards/rejected": -10.1875, "step": 20660 }, { "epoch": 0.8205148562014966, "grad_norm": 29.868011892267916, "learning_rate": 4.74773007082237e-08, "logits/chosen": -2.5625, "logits/rejected": -2.90625, "logps/chosen": -656.0, "logps/rejected": -1184.0, "loss": 0.2148, "rewards/accuracies": 0.96875, "rewards/chosen": -4.78125, "rewards/margins": 5.1875, "rewards/rejected": -10.0, "step": 20670 }, { "epoch": 0.8209118154933212, "grad_norm": 17.66137250740349, "learning_rate": 4.7274380036851167e-08, "logits/chosen": -2.8125, "logits/rejected": -3.125, "logps/chosen": -660.0, "logps/rejected": -1160.0, "loss": 0.2097, "rewards/accuracies": 0.96875, "rewards/chosen": -5.0625, "rewards/margins": 4.9375, "rewards/rejected": -10.0, "step": 20680 }, { "epoch": 0.8213087747851457, "grad_norm": 18.714916582361003, "learning_rate": 4.70718486498417e-08, "logits/chosen": -2.796875, "logits/rejected": -3.03125, "logps/chosen": -668.0, "logps/rejected": -1168.0, "loss": 0.198, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -5.125, "rewards/margins": 5.03125, "rewards/rejected": -10.125, "step": 20690 }, { "epoch": 0.8217057340769705, "grad_norm": 40.16142159015803, "learning_rate": 4.686970693610687e-08, "logits/chosen": -2.71875, "logits/rejected": -2.8125, "logps/chosen": -692.0, "logps/rejected": -1224.0, "loss": 0.2613, "rewards/accuracies": 0.96875, "rewards/chosen": -5.1875, "rewards/margins": 5.28125, "rewards/rejected": -10.5, "step": 20700 }, { "epoch": 0.822102693368795, "grad_norm": 36.61328603978284, "learning_rate": 4.666795528380959e-08, "logits/chosen": -2.78125, "logits/rejected": -3.046875, "logps/chosen": -680.0, "logps/rejected": -1224.0, "loss": 0.1929, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.21875, "rewards/margins": 5.25, "rewards/rejected": -10.4375, "step": 20710 }, { "epoch": 0.8224996526606196, "grad_norm": 21.688362017509725, "learning_rate": 4.646659408036413e-08, "logits/chosen": -2.75, "logits/rejected": -2.96875, "logps/chosen": -652.0, "logps/rejected": -1200.0, "loss": 0.2087, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.75, "rewards/margins": 5.59375, "rewards/rejected": -10.375, "step": 20720 }, { "epoch": 0.8228966119524442, "grad_norm": 18.595342472461432, "learning_rate": 4.626562371243473e-08, "logits/chosen": -2.734375, "logits/rejected": -3.109375, "logps/chosen": -688.0, "logps/rejected": -1168.0, "loss": 0.2131, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.15625, "rewards/margins": 4.96875, "rewards/rejected": -10.125, "step": 20730 }, { "epoch": 0.8232935712442689, "grad_norm": 34.421619345773564, "learning_rate": 4.6065044565935267e-08, "logits/chosen": -2.796875, "logits/rejected": -2.96875, "logps/chosen": -668.0, "logps/rejected": -1160.0, "loss": 0.2032, "rewards/accuracies": 0.9375, "rewards/chosen": -5.0, "rewards/margins": 4.96875, "rewards/rejected": -9.9375, "step": 20740 }, { "epoch": 0.8236905305360935, "grad_norm": 16.9567553845892, "learning_rate": 4.586485702602838e-08, "logits/chosen": -2.65625, "logits/rejected": -2.8125, "logps/chosen": -672.0, "logps/rejected": -1192.0, "loss": 0.182, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.875, "rewards/margins": 5.3125, "rewards/rejected": -10.1875, "step": 20750 }, { "epoch": 0.8240874898279181, "grad_norm": 27.070000350073016, "learning_rate": 4.566506147712451e-08, "logits/chosen": -2.71875, "logits/rejected": -2.875, "logps/chosen": -676.0, "logps/rejected": -1176.0, "loss": 0.2069, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.0625, "rewards/margins": 4.9375, "rewards/rejected": -10.0, "step": 20760 }, { "epoch": 0.8244844491197427, "grad_norm": 13.603806258564763, "learning_rate": 4.546565830288174e-08, "logits/chosen": -2.6875, "logits/rejected": -3.046875, "logps/chosen": -700.0, "logps/rejected": -1152.0, "loss": 0.2361, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.1875, "rewards/margins": 4.6875, "rewards/rejected": -9.875, "step": 20770 }, { "epoch": 0.8248814084115674, "grad_norm": 23.708146600855876, "learning_rate": 4.526664788620435e-08, "logits/chosen": -2.640625, "logits/rejected": -2.78125, "logps/chosen": -700.0, "logps/rejected": -1256.0, "loss": 0.197, "rewards/accuracies": 0.9375, "rewards/chosen": -5.25, "rewards/margins": 5.34375, "rewards/rejected": -10.625, "step": 20780 }, { "epoch": 0.825278367703392, "grad_norm": 21.170303440110455, "learning_rate": 4.506803060924269e-08, "logits/chosen": -2.734375, "logits/rejected": -3.140625, "logps/chosen": -668.0, "logps/rejected": -1160.0, "loss": 0.1833, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.0625, "rewards/margins": 4.96875, "rewards/rejected": -10.0, "step": 20790 }, { "epoch": 0.8256753269952166, "grad_norm": 25.785142503297077, "learning_rate": 4.4869806853392036e-08, "logits/chosen": -2.78125, "logits/rejected": -2.96875, "logps/chosen": -672.0, "logps/rejected": -1192.0, "loss": 0.1832, "rewards/accuracies": 0.96875, "rewards/chosen": -4.9375, "rewards/margins": 5.34375, "rewards/rejected": -10.25, "step": 20800 }, { "epoch": 0.8260722862870412, "grad_norm": 22.152733477207523, "learning_rate": 4.467197699929193e-08, "logits/chosen": -2.703125, "logits/rejected": -2.984375, "logps/chosen": -668.0, "logps/rejected": -1184.0, "loss": 0.188, "rewards/accuracies": 0.96875, "rewards/chosen": -5.0625, "rewards/margins": 5.25, "rewards/rejected": -10.3125, "step": 20810 }, { "epoch": 0.8264692455788659, "grad_norm": 33.36847118239904, "learning_rate": 4.44745414268258e-08, "logits/chosen": -2.734375, "logits/rejected": -2.984375, "logps/chosen": -708.0, "logps/rejected": -1200.0, "loss": 0.2183, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.25, "rewards/margins": 5.03125, "rewards/rejected": -10.25, "step": 20820 }, { "epoch": 0.8268662048706905, "grad_norm": 29.06376684697212, "learning_rate": 4.4277500515119607e-08, "logits/chosen": -2.90625, "logits/rejected": -2.921875, "logps/chosen": -684.0, "logps/rejected": -1176.0, "loss": 0.2354, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.21875, "rewards/margins": 4.84375, "rewards/rejected": -10.0625, "step": 20830 }, { "epoch": 0.8272631641625151, "grad_norm": 37.6423406889198, "learning_rate": 4.4080854642541826e-08, "logits/chosen": -2.71875, "logits/rejected": -2.796875, "logps/chosen": -700.0, "logps/rejected": -1224.0, "loss": 0.2502, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.15625, "rewards/margins": 5.28125, "rewards/rejected": -10.4375, "step": 20840 }, { "epoch": 0.8276601234543397, "grad_norm": 24.318853647787595, "learning_rate": 4.388460418670206e-08, "logits/chosen": -2.703125, "logits/rejected": -2.90625, "logps/chosen": -696.0, "logps/rejected": -1224.0, "loss": 0.1968, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.1875, "rewards/margins": 5.40625, "rewards/rejected": -10.5625, "step": 20850 }, { "epoch": 0.8280570827461644, "grad_norm": 30.456500927988685, "learning_rate": 4.368874952445068e-08, "logits/chosen": -2.734375, "logits/rejected": -2.890625, "logps/chosen": -656.0, "logps/rejected": -1208.0, "loss": 0.2205, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.8125, "rewards/margins": 5.625, "rewards/rejected": -10.4375, "step": 20860 }, { "epoch": 0.828454042037989, "grad_norm": 28.32171428442756, "learning_rate": 4.349329103187815e-08, "logits/chosen": -2.765625, "logits/rejected": -3.09375, "logps/chosen": -672.0, "logps/rejected": -1200.0, "loss": 0.1957, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.0, "rewards/margins": 5.28125, "rewards/rejected": -10.3125, "step": 20870 }, { "epoch": 0.8288510013298136, "grad_norm": 24.04166748603074, "learning_rate": 4.3298229084314075e-08, "logits/chosen": -2.84375, "logits/rejected": -3.078125, "logps/chosen": -696.0, "logps/rejected": -1160.0, "loss": 0.2116, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.09375, "rewards/margins": 4.90625, "rewards/rejected": -10.0, "step": 20880 }, { "epoch": 0.8292479606216383, "grad_norm": 26.396006997646705, "learning_rate": 4.310356405632673e-08, "logits/chosen": -2.71875, "logits/rejected": -2.984375, "logps/chosen": -668.0, "logps/rejected": -1200.0, "loss": 0.2418, "rewards/accuracies": 0.96875, "rewards/chosen": -4.90625, "rewards/margins": 5.53125, "rewards/rejected": -10.4375, "step": 20890 }, { "epoch": 0.8296449199134629, "grad_norm": 14.654112964912342, "learning_rate": 4.290929632172199e-08, "logits/chosen": -2.71875, "logits/rejected": -2.796875, "logps/chosen": -700.0, "logps/rejected": -1240.0, "loss": 0.2086, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.125, "rewards/margins": 5.375, "rewards/rejected": -10.5, "step": 20900 }, { "epoch": 0.8300418792052875, "grad_norm": 30.338426956540903, "learning_rate": 4.271542625354302e-08, "logits/chosen": -2.671875, "logits/rejected": -3.0625, "logps/chosen": -644.0, "logps/rejected": -1168.0, "loss": 0.2132, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.75, "rewards/margins": 5.375, "rewards/rejected": -10.125, "step": 20910 }, { "epoch": 0.8304388384971121, "grad_norm": 32.81211259556942, "learning_rate": 4.252195422406912e-08, "logits/chosen": -2.765625, "logits/rejected": -2.875, "logps/chosen": -696.0, "logps/rejected": -1200.0, "loss": 0.2282, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.34375, "rewards/margins": 4.90625, "rewards/rejected": -10.25, "step": 20920 }, { "epoch": 0.8308357977889368, "grad_norm": 31.06129797641686, "learning_rate": 4.232888060481568e-08, "logits/chosen": -2.8125, "logits/rejected": -3.0, "logps/chosen": -704.0, "logps/rejected": -1192.0, "loss": 0.2412, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.375, "rewards/margins": 4.90625, "rewards/rejected": -10.3125, "step": 20930 }, { "epoch": 0.8312327570807614, "grad_norm": 28.1842011274624, "learning_rate": 4.213620576653265e-08, "logits/chosen": -2.78125, "logits/rejected": -2.734375, "logps/chosen": -680.0, "logps/rejected": -1192.0, "loss": 0.1869, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.1875, "rewards/margins": 5.25, "rewards/rejected": -10.4375, "step": 20940 }, { "epoch": 0.831629716372586, "grad_norm": 43.21522301843219, "learning_rate": 4.194393007920438e-08, "logits/chosen": -2.671875, "logits/rejected": -2.921875, "logps/chosen": -680.0, "logps/rejected": -1200.0, "loss": 0.205, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.0, "rewards/margins": 5.46875, "rewards/rejected": -10.4375, "step": 20950 }, { "epoch": 0.8320266756644106, "grad_norm": 21.932482119542925, "learning_rate": 4.175205391204864e-08, "logits/chosen": -2.59375, "logits/rejected": -2.984375, "logps/chosen": -688.0, "logps/rejected": -1200.0, "loss": 0.1918, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.21875, "rewards/margins": 5.1875, "rewards/rejected": -10.4375, "step": 20960 }, { "epoch": 0.8324236349562353, "grad_norm": 18.700267169336975, "learning_rate": 4.156057763351617e-08, "logits/chosen": -2.671875, "logits/rejected": -3.0, "logps/chosen": -692.0, "logps/rejected": -1216.0, "loss": 0.2305, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.15625, "rewards/margins": 5.40625, "rewards/rejected": -10.5625, "step": 20970 }, { "epoch": 0.8328205942480599, "grad_norm": 26.792750756843965, "learning_rate": 4.1369501611289856e-08, "logits/chosen": -2.734375, "logits/rejected": -3.015625, "logps/chosen": -696.0, "logps/rejected": -1176.0, "loss": 0.1756, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -5.21875, "rewards/margins": 4.90625, "rewards/rejected": -10.125, "step": 20980 }, { "epoch": 0.8332175535398845, "grad_norm": 32.9950134650726, "learning_rate": 4.117882621228377e-08, "logits/chosen": -2.65625, "logits/rejected": -2.828125, "logps/chosen": -676.0, "logps/rejected": -1200.0, "loss": 0.2323, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -5.1875, "rewards/margins": 5.15625, "rewards/rejected": -10.3125, "step": 20990 }, { "epoch": 0.8336145128317091, "grad_norm": 30.519133459893833, "learning_rate": 4.098855180264285e-08, "logits/chosen": -2.78125, "logits/rejected": -2.984375, "logps/chosen": -656.0, "logps/rejected": -1208.0, "loss": 0.1948, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.875, "rewards/margins": 5.375, "rewards/rejected": -10.25, "step": 21000 }, { "epoch": 0.8336145128317091, "eval_logits/chosen": -2.734375, "eval_logits/rejected": -2.953125, "eval_logps/chosen": -720.0, "eval_logps/rejected": -1128.0, "eval_loss": 0.2516096830368042, "eval_rewards/accuracies": 0.8948822617530823, "eval_rewards/chosen": -5.375, "eval_rewards/margins": 4.28125, "eval_rewards/rejected": -9.625, "eval_runtime": 5412.0034, "eval_samples_per_second": 32.638, "eval_steps_per_second": 0.51, "step": 21000 }, { "epoch": 0.8340114721235338, "grad_norm": 21.995562767895052, "learning_rate": 4.079867874774193e-08, "logits/chosen": -2.6875, "logits/rejected": -2.953125, "logps/chosen": -696.0, "logps/rejected": -1224.0, "loss": 0.1694, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.03125, "rewards/margins": 5.4375, "rewards/rejected": -10.4375, "step": 21010 }, { "epoch": 0.8344084314153584, "grad_norm": 45.421212832611424, "learning_rate": 4.0609207412185264e-08, "logits/chosen": -2.75, "logits/rejected": -2.875, "logps/chosen": -676.0, "logps/rejected": -1224.0, "loss": 0.2367, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.0, "rewards/margins": 5.53125, "rewards/rejected": -10.5, "step": 21020 }, { "epoch": 0.834805390707183, "grad_norm": 26.89094671980656, "learning_rate": 4.042013815980566e-08, "logits/chosen": -2.78125, "logits/rejected": -3.0625, "logps/chosen": -692.0, "logps/rejected": -1192.0, "loss": 0.2373, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.09375, "rewards/margins": 5.125, "rewards/rejected": -10.25, "step": 21030 }, { "epoch": 0.8352023499990076, "grad_norm": 31.199934016878586, "learning_rate": 4.0231471353663836e-08, "logits/chosen": -2.671875, "logits/rejected": -2.734375, "logps/chosen": -704.0, "logps/rejected": -1192.0, "loss": 0.2279, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.15625, "rewards/margins": 5.03125, "rewards/rejected": -10.1875, "step": 21040 }, { "epoch": 0.8355993092908323, "grad_norm": 17.455924818959314, "learning_rate": 4.0043207356047606e-08, "logits/chosen": -2.8125, "logits/rejected": -3.078125, "logps/chosen": -660.0, "logps/rejected": -1160.0, "loss": 0.216, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.96875, "rewards/margins": 5.125, "rewards/rejected": -10.125, "step": 21050 }, { "epoch": 0.8359962685826569, "grad_norm": 20.180288948715408, "learning_rate": 3.985534652847133e-08, "logits/chosen": -2.703125, "logits/rejected": -3.15625, "logps/chosen": -692.0, "logps/rejected": -1176.0, "loss": 0.198, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.15625, "rewards/margins": 5.0, "rewards/rejected": -10.1875, "step": 21060 }, { "epoch": 0.8363932278744814, "grad_norm": 41.855891929648564, "learning_rate": 3.966788923167527e-08, "logits/chosen": -2.59375, "logits/rejected": -3.15625, "logps/chosen": -684.0, "logps/rejected": -1176.0, "loss": 0.2471, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.84375, "rewards/margins": 5.3125, "rewards/rejected": -10.1875, "step": 21070 }, { "epoch": 0.836790187166306, "grad_norm": 33.0042062780367, "learning_rate": 3.948083582562486e-08, "logits/chosen": -2.78125, "logits/rejected": -2.828125, "logps/chosen": -712.0, "logps/rejected": -1176.0, "loss": 0.2282, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.25, "rewards/margins": 4.75, "rewards/rejected": -10.0, "step": 21080 }, { "epoch": 0.8371871464581307, "grad_norm": 30.03115881477337, "learning_rate": 3.929418666950973e-08, "logits/chosen": -2.59375, "logits/rejected": -2.90625, "logps/chosen": -680.0, "logps/rejected": -1168.0, "loss": 0.2187, "rewards/accuracies": 0.96875, "rewards/chosen": -4.875, "rewards/margins": 5.1875, "rewards/rejected": -10.0625, "step": 21090 }, { "epoch": 0.8375841057499553, "grad_norm": 28.927530731739974, "learning_rate": 3.910794212174337e-08, "logits/chosen": -2.625, "logits/rejected": -2.78125, "logps/chosen": -652.0, "logps/rejected": -1136.0, "loss": 0.1944, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.8125, "rewards/margins": 5.0625, "rewards/rejected": -9.875, "step": 21100 }, { "epoch": 0.8379810650417799, "grad_norm": 29.800677143424526, "learning_rate": 3.892210253996242e-08, "logits/chosen": -2.453125, "logits/rejected": -2.953125, "logps/chosen": -676.0, "logps/rejected": -1176.0, "loss": 0.238, "rewards/accuracies": 0.96875, "rewards/chosen": -4.90625, "rewards/margins": 5.09375, "rewards/rejected": -10.0, "step": 21110 }, { "epoch": 0.8383780243336046, "grad_norm": 20.892107257648405, "learning_rate": 3.873666828102568e-08, "logits/chosen": -2.703125, "logits/rejected": -2.9375, "logps/chosen": -704.0, "logps/rejected": -1192.0, "loss": 0.2016, "rewards/accuracies": 0.96875, "rewards/chosen": -5.1875, "rewards/margins": 4.9375, "rewards/rejected": -10.125, "step": 21120 }, { "epoch": 0.8387749836254292, "grad_norm": 24.289645034365524, "learning_rate": 3.85516397010138e-08, "logits/chosen": -2.65625, "logits/rejected": -2.859375, "logps/chosen": -656.0, "logps/rejected": -1152.0, "loss": 0.2066, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.84375, "rewards/margins": 5.1875, "rewards/rejected": -10.0625, "step": 21130 }, { "epoch": 0.8391719429172538, "grad_norm": 26.265285686748722, "learning_rate": 3.836701715522839e-08, "logits/chosen": -2.65625, "logits/rejected": -2.828125, "logps/chosen": -676.0, "logps/rejected": -1168.0, "loss": 0.2143, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.0, "rewards/margins": 5.03125, "rewards/rejected": -10.0, "step": 21140 }, { "epoch": 0.8395689022090784, "grad_norm": 31.819308109286084, "learning_rate": 3.818280099819121e-08, "logits/chosen": -2.6875, "logits/rejected": -2.921875, "logps/chosen": -620.0, "logps/rejected": -1160.0, "loss": 0.2014, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.65625, "rewards/margins": 5.25, "rewards/rejected": -9.875, "step": 21150 }, { "epoch": 0.8399658615009031, "grad_norm": 34.07307542130277, "learning_rate": 3.799899158364392e-08, "logits/chosen": -2.59375, "logits/rejected": -2.78125, "logps/chosen": -680.0, "logps/rejected": -1176.0, "loss": 0.2024, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.84375, "rewards/margins": 5.1875, "rewards/rejected": -10.0, "step": 21160 }, { "epoch": 0.8403628207927277, "grad_norm": 21.36595358268203, "learning_rate": 3.7815589264546975e-08, "logits/chosen": -2.515625, "logits/rejected": -2.71875, "logps/chosen": -688.0, "logps/rejected": -1208.0, "loss": 0.2036, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.96875, "rewards/margins": 5.34375, "rewards/rejected": -10.3125, "step": 21170 }, { "epoch": 0.8407597800845523, "grad_norm": 34.85712037983203, "learning_rate": 3.7632594393079006e-08, "logits/chosen": -2.71875, "logits/rejected": -2.875, "logps/chosen": -676.0, "logps/rejected": -1184.0, "loss": 0.2281, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.96875, "rewards/margins": 5.1875, "rewards/rejected": -10.125, "step": 21180 }, { "epoch": 0.8411567393763769, "grad_norm": 29.375874970187017, "learning_rate": 3.7450007320636494e-08, "logits/chosen": -2.734375, "logits/rejected": -2.9375, "logps/chosen": -644.0, "logps/rejected": -1168.0, "loss": 0.2117, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.71875, "rewards/margins": 5.25, "rewards/rejected": -10.0, "step": 21190 }, { "epoch": 0.8415536986682016, "grad_norm": 24.52116879371863, "learning_rate": 3.72678283978326e-08, "logits/chosen": -2.578125, "logits/rejected": -2.9375, "logps/chosen": -644.0, "logps/rejected": -1184.0, "loss": 0.2304, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.6875, "rewards/margins": 5.5, "rewards/rejected": -10.1875, "step": 21200 }, { "epoch": 0.8419506579600262, "grad_norm": 9.41853557609498, "learning_rate": 3.708605797449696e-08, "logits/chosen": -2.671875, "logits/rejected": -2.84375, "logps/chosen": -652.0, "logps/rejected": -1168.0, "loss": 0.1913, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.90625, "rewards/margins": 5.28125, "rewards/rejected": -10.1875, "step": 21210 }, { "epoch": 0.8423476172518508, "grad_norm": 24.37302493531755, "learning_rate": 3.6904696399674615e-08, "logits/chosen": -2.8125, "logits/rejected": -3.015625, "logps/chosen": -680.0, "logps/rejected": -1176.0, "loss": 0.1702, "rewards/accuracies": 0.96875, "rewards/chosen": -5.03125, "rewards/margins": 5.1875, "rewards/rejected": -10.25, "step": 21220 }, { "epoch": 0.8427445765436754, "grad_norm": 39.28600888390209, "learning_rate": 3.672374402162548e-08, "logits/chosen": -2.78125, "logits/rejected": -2.9375, "logps/chosen": -692.0, "logps/rejected": -1200.0, "loss": 0.1932, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.09375, "rewards/margins": 5.09375, "rewards/rejected": -10.1875, "step": 21230 }, { "epoch": 0.8431415358355001, "grad_norm": 30.17176728087822, "learning_rate": 3.6543201187823944e-08, "logits/chosen": -2.6875, "logits/rejected": -3.0625, "logps/chosen": -668.0, "logps/rejected": -1152.0, "loss": 0.2128, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.875, "rewards/margins": 5.1875, "rewards/rejected": -10.125, "step": 21240 }, { "epoch": 0.8435384951273247, "grad_norm": 29.182246308635793, "learning_rate": 3.636306824495769e-08, "logits/chosen": -2.640625, "logits/rejected": -2.8125, "logps/chosen": -668.0, "logps/rejected": -1168.0, "loss": 0.2127, "rewards/accuracies": 0.96875, "rewards/chosen": -4.96875, "rewards/margins": 5.0625, "rewards/rejected": -10.0, "step": 21250 }, { "epoch": 0.8439354544191493, "grad_norm": 29.25595951877557, "learning_rate": 3.618334553892757e-08, "logits/chosen": -2.734375, "logits/rejected": -2.8125, "logps/chosen": -684.0, "logps/rejected": -1184.0, "loss": 0.167, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.15625, "rewards/margins": 4.96875, "rewards/rejected": -10.125, "step": 21260 }, { "epoch": 0.8443324137109739, "grad_norm": 37.048067561110166, "learning_rate": 3.600403341484648e-08, "logits/chosen": -2.671875, "logits/rejected": -3.046875, "logps/chosen": -692.0, "logps/rejected": -1192.0, "loss": 0.2407, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.125, "rewards/margins": 5.1875, "rewards/rejected": -10.3125, "step": 21270 }, { "epoch": 0.8447293730027986, "grad_norm": 15.368177202091502, "learning_rate": 3.582513221703889e-08, "logits/chosen": -2.53125, "logits/rejected": -2.796875, "logps/chosen": -684.0, "logps/rejected": -1208.0, "loss": 0.231, "rewards/accuracies": 0.9375, "rewards/chosen": -5.09375, "rewards/margins": 5.40625, "rewards/rejected": -10.5, "step": 21280 }, { "epoch": 0.8451263322946232, "grad_norm": 27.24719345486318, "learning_rate": 3.564664228904035e-08, "logits/chosen": -2.8125, "logits/rejected": -2.984375, "logps/chosen": -668.0, "logps/rejected": -1176.0, "loss": 0.1968, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.0625, "rewards/margins": 5.03125, "rewards/rejected": -10.125, "step": 21290 }, { "epoch": 0.8455232915864478, "grad_norm": 22.605257507648357, "learning_rate": 3.54685639735966e-08, "logits/chosen": -2.765625, "logits/rejected": -3.140625, "logps/chosen": -688.0, "logps/rejected": -1216.0, "loss": 0.2052, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.15625, "rewards/margins": 5.4375, "rewards/rejected": -10.5625, "step": 21300 }, { "epoch": 0.8459202508782724, "grad_norm": 33.42543840517224, "learning_rate": 3.529089761266293e-08, "logits/chosen": -2.6875, "logits/rejected": -3.078125, "logps/chosen": -700.0, "logps/rejected": -1168.0, "loss": 0.2083, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.25, "rewards/margins": 4.9375, "rewards/rejected": -10.1875, "step": 21310 }, { "epoch": 0.8463172101700971, "grad_norm": 37.42732437520986, "learning_rate": 3.511364354740357e-08, "logits/chosen": -2.828125, "logits/rejected": -2.953125, "logps/chosen": -652.0, "logps/rejected": -1240.0, "loss": 0.1885, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.9375, "rewards/margins": 5.6875, "rewards/rejected": -10.625, "step": 21320 }, { "epoch": 0.8467141694619217, "grad_norm": 18.563436269295217, "learning_rate": 3.493680211819103e-08, "logits/chosen": -2.640625, "logits/rejected": -2.828125, "logps/chosen": -644.0, "logps/rejected": -1200.0, "loss": 0.1903, "rewards/accuracies": 0.96875, "rewards/chosen": -4.6875, "rewards/margins": 5.625, "rewards/rejected": -10.3125, "step": 21330 }, { "epoch": 0.8471111287537463, "grad_norm": 24.977249989332154, "learning_rate": 3.476037366460552e-08, "logits/chosen": -2.71875, "logits/rejected": -3.0, "logps/chosen": -692.0, "logps/rejected": -1176.0, "loss": 0.1921, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.125, "rewards/margins": 4.875, "rewards/rejected": -10.0, "step": 21340 }, { "epoch": 0.8475080880455709, "grad_norm": 23.40111966418473, "learning_rate": 3.458435852543431e-08, "logits/chosen": -2.59375, "logits/rejected": -2.90625, "logps/chosen": -640.0, "logps/rejected": -1192.0, "loss": 0.1745, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.78125, "rewards/margins": 5.4375, "rewards/rejected": -10.1875, "step": 21350 }, { "epoch": 0.8479050473373956, "grad_norm": 33.5658275932938, "learning_rate": 3.44087570386708e-08, "logits/chosen": -2.75, "logits/rejected": -3.234375, "logps/chosen": -644.0, "logps/rejected": -1160.0, "loss": 0.2259, "rewards/accuracies": 0.96875, "rewards/chosen": -4.90625, "rewards/margins": 5.21875, "rewards/rejected": -10.125, "step": 21360 }, { "epoch": 0.8483020066292202, "grad_norm": 18.576638716222654, "learning_rate": 3.423356954151421e-08, "logits/chosen": -2.765625, "logits/rejected": -2.734375, "logps/chosen": -672.0, "logps/rejected": -1208.0, "loss": 0.2218, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.0, "rewards/margins": 5.21875, "rewards/rejected": -10.25, "step": 21370 }, { "epoch": 0.8486989659210448, "grad_norm": 23.217822023593857, "learning_rate": 3.405879637036868e-08, "logits/chosen": -2.796875, "logits/rejected": -3.03125, "logps/chosen": -644.0, "logps/rejected": -1184.0, "loss": 0.202, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.84375, "rewards/margins": 5.3125, "rewards/rejected": -10.1875, "step": 21380 }, { "epoch": 0.8490959252128695, "grad_norm": 50.156299957657886, "learning_rate": 3.388443786084286e-08, "logits/chosen": -2.828125, "logits/rejected": -2.734375, "logps/chosen": -700.0, "logps/rejected": -1200.0, "loss": 0.2136, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.28125, "rewards/margins": 5.03125, "rewards/rejected": -10.3125, "step": 21390 }, { "epoch": 0.8494928845046941, "grad_norm": 28.70420599191098, "learning_rate": 3.371049434774914e-08, "logits/chosen": -2.6875, "logits/rejected": -2.953125, "logps/chosen": -692.0, "logps/rejected": -1184.0, "loss": 0.2041, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.0625, "rewards/margins": 5.09375, "rewards/rejected": -10.1875, "step": 21400 }, { "epoch": 0.8498898437965187, "grad_norm": 31.64726066027249, "learning_rate": 3.353696616510299e-08, "logits/chosen": -2.75, "logits/rejected": -2.953125, "logps/chosen": -668.0, "logps/rejected": -1216.0, "loss": 0.2126, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.9375, "rewards/margins": 5.4375, "rewards/rejected": -10.375, "step": 21410 }, { "epoch": 0.8502868030883433, "grad_norm": 22.23204996097477, "learning_rate": 3.3363853646122234e-08, "logits/chosen": -2.578125, "logits/rejected": -2.71875, "logps/chosen": -676.0, "logps/rejected": -1280.0, "loss": 0.1879, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.84375, "rewards/margins": 6.0625, "rewards/rejected": -10.875, "step": 21420 }, { "epoch": 0.850683762380168, "grad_norm": 25.991073101708828, "learning_rate": 3.319115712322657e-08, "logits/chosen": -2.609375, "logits/rejected": -2.796875, "logps/chosen": -672.0, "logps/rejected": -1136.0, "loss": 0.2522, "rewards/accuracies": 0.9375, "rewards/chosen": -4.9375, "rewards/margins": 4.78125, "rewards/rejected": -9.75, "step": 21430 }, { "epoch": 0.8510807216719926, "grad_norm": 30.089416788887164, "learning_rate": 3.301887692803709e-08, "logits/chosen": -2.578125, "logits/rejected": -2.765625, "logps/chosen": -688.0, "logps/rejected": -1216.0, "loss": 0.1789, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.90625, "rewards/margins": 5.4375, "rewards/rejected": -10.375, "step": 21440 }, { "epoch": 0.8514776809638172, "grad_norm": 23.707372422544093, "learning_rate": 3.2847013391375164e-08, "logits/chosen": -2.515625, "logits/rejected": -2.984375, "logps/chosen": -676.0, "logps/rejected": -1208.0, "loss": 0.1733, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.9375, "rewards/margins": 5.53125, "rewards/rejected": -10.4375, "step": 21450 }, { "epoch": 0.8518746402556417, "grad_norm": 17.891829601593532, "learning_rate": 3.267556684326217e-08, "logits/chosen": -2.625, "logits/rejected": -2.8125, "logps/chosen": -696.0, "logps/rejected": -1192.0, "loss": 0.1982, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -5.0625, "rewards/margins": 5.0, "rewards/rejected": -10.0625, "step": 21460 }, { "epoch": 0.8522715995474665, "grad_norm": 27.677455717440836, "learning_rate": 3.2504537612918686e-08, "logits/chosen": -2.796875, "logits/rejected": -2.96875, "logps/chosen": -688.0, "logps/rejected": -1192.0, "loss": 0.2133, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.1875, "rewards/margins": 5.0, "rewards/rejected": -10.1875, "step": 21470 }, { "epoch": 0.852668558839291, "grad_norm": 17.515497176259757, "learning_rate": 3.2333926028764114e-08, "logits/chosen": -2.71875, "logits/rejected": -2.796875, "logps/chosen": -672.0, "logps/rejected": -1192.0, "loss": 0.1987, "rewards/accuracies": 0.9375, "rewards/chosen": -5.0, "rewards/margins": 5.125, "rewards/rejected": -10.125, "step": 21480 }, { "epoch": 0.8530655181311156, "grad_norm": 27.278302482450883, "learning_rate": 3.2163732418415664e-08, "logits/chosen": -2.6875, "logits/rejected": -3.0, "logps/chosen": -684.0, "logps/rejected": -1160.0, "loss": 0.1985, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.28125, "rewards/margins": 4.78125, "rewards/rejected": -10.0625, "step": 21490 }, { "epoch": 0.8534624774229402, "grad_norm": 22.458155178476357, "learning_rate": 3.199395710868813e-08, "logits/chosen": -2.640625, "logits/rejected": -2.75, "logps/chosen": -640.0, "logps/rejected": -1208.0, "loss": 0.2505, "rewards/accuracies": 0.96875, "rewards/chosen": -4.71875, "rewards/margins": 5.59375, "rewards/rejected": -10.3125, "step": 21500 }, { "epoch": 0.8538594367147649, "grad_norm": 23.720855173976144, "learning_rate": 3.182460042559293e-08, "logits/chosen": -2.734375, "logits/rejected": -2.953125, "logps/chosen": -724.0, "logps/rejected": -1208.0, "loss": 0.2024, "rewards/accuracies": 0.9375, "rewards/chosen": -5.375, "rewards/margins": 4.90625, "rewards/rejected": -10.25, "step": 21510 }, { "epoch": 0.8542563960065895, "grad_norm": 23.975445593884206, "learning_rate": 3.165566269433756e-08, "logits/chosen": -2.65625, "logits/rejected": -2.828125, "logps/chosen": -656.0, "logps/rejected": -1192.0, "loss": 0.2156, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.90625, "rewards/margins": 5.34375, "rewards/rejected": -10.25, "step": 21520 }, { "epoch": 0.8546533552984141, "grad_norm": 35.645278974189694, "learning_rate": 3.148714423932525e-08, "logits/chosen": -2.828125, "logits/rejected": -3.0, "logps/chosen": -652.0, "logps/rejected": -1144.0, "loss": 0.2489, "rewards/accuracies": 0.96875, "rewards/chosen": -4.9375, "rewards/margins": 5.03125, "rewards/rejected": -10.0, "step": 21530 }, { "epoch": 0.8550503145902387, "grad_norm": 21.48921765228955, "learning_rate": 3.1319045384153835e-08, "logits/chosen": -2.703125, "logits/rejected": -2.921875, "logps/chosen": -656.0, "logps/rejected": -1168.0, "loss": 0.2005, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.75, "rewards/margins": 5.1875, "rewards/rejected": -9.9375, "step": 21540 }, { "epoch": 0.8554472738820634, "grad_norm": 23.894757185492335, "learning_rate": 3.1151366451615676e-08, "logits/chosen": -2.8125, "logits/rejected": -3.140625, "logps/chosen": -652.0, "logps/rejected": -1200.0, "loss": 0.1722, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.875, "rewards/margins": 5.53125, "rewards/rejected": -10.4375, "step": 21550 }, { "epoch": 0.855844233173888, "grad_norm": 39.65696941375758, "learning_rate": 3.098410776369656e-08, "logits/chosen": -2.59375, "logits/rejected": -2.90625, "logps/chosen": -696.0, "logps/rejected": -1232.0, "loss": 0.2345, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.09375, "rewards/margins": 5.4375, "rewards/rejected": -10.5625, "step": 21560 }, { "epoch": 0.8562411924657126, "grad_norm": 31.94520018466084, "learning_rate": 3.0817269641575385e-08, "logits/chosen": -2.875, "logits/rejected": -3.046875, "logps/chosen": -668.0, "logps/rejected": -1128.0, "loss": 0.2211, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.0625, "rewards/margins": 4.6875, "rewards/rejected": -9.75, "step": 21570 }, { "epoch": 0.8566381517575372, "grad_norm": 20.217800537285637, "learning_rate": 3.065085240562351e-08, "logits/chosen": -2.71875, "logits/rejected": -3.09375, "logps/chosen": -676.0, "logps/rejected": -1176.0, "loss": 0.2217, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.96875, "rewards/margins": 5.15625, "rewards/rejected": -10.125, "step": 21580 }, { "epoch": 0.8570351110493619, "grad_norm": 31.773467145786743, "learning_rate": 3.048485637540399e-08, "logits/chosen": -2.671875, "logits/rejected": -3.09375, "logps/chosen": -676.0, "logps/rejected": -1184.0, "loss": 0.1862, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.9375, "rewards/margins": 5.34375, "rewards/rejected": -10.3125, "step": 21590 }, { "epoch": 0.8574320703411865, "grad_norm": 39.439352643431796, "learning_rate": 3.031928186967117e-08, "logits/chosen": -2.6875, "logits/rejected": -3.0, "logps/chosen": -680.0, "logps/rejected": -1224.0, "loss": 0.2114, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.0, "rewards/margins": 5.65625, "rewards/rejected": -10.6875, "step": 21600 }, { "epoch": 0.8578290296330111, "grad_norm": 30.180251662559304, "learning_rate": 3.015412920636981e-08, "logits/chosen": -2.6875, "logits/rejected": -2.875, "logps/chosen": -672.0, "logps/rejected": -1216.0, "loss": 0.2139, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.0, "rewards/margins": 5.46875, "rewards/rejected": -10.4375, "step": 21610 }, { "epoch": 0.8582259889248357, "grad_norm": 27.37726778044406, "learning_rate": 2.998939870263481e-08, "logits/chosen": -2.765625, "logits/rejected": -3.0625, "logps/chosen": -692.0, "logps/rejected": -1168.0, "loss": 0.2148, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.25, "rewards/margins": 4.875, "rewards/rejected": -10.125, "step": 21620 }, { "epoch": 0.8586229482166604, "grad_norm": 28.916258594623624, "learning_rate": 2.982509067479028e-08, "logits/chosen": -2.734375, "logits/rejected": -2.8125, "logps/chosen": -664.0, "logps/rejected": -1232.0, "loss": 0.2276, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.0625, "rewards/margins": 5.59375, "rewards/rejected": -10.6875, "step": 21630 }, { "epoch": 0.859019907508485, "grad_norm": 23.554241265217478, "learning_rate": 2.966120543834902e-08, "logits/chosen": -2.703125, "logits/rejected": -2.796875, "logps/chosen": -676.0, "logps/rejected": -1248.0, "loss": 0.2182, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.96875, "rewards/margins": 5.53125, "rewards/rejected": -10.5, "step": 21640 }, { "epoch": 0.8594168668003096, "grad_norm": 28.761921031256193, "learning_rate": 2.9497743308012164e-08, "logits/chosen": -2.734375, "logits/rejected": -2.953125, "logps/chosen": -680.0, "logps/rejected": -1176.0, "loss": 0.2207, "rewards/accuracies": 0.96875, "rewards/chosen": -5.0625, "rewards/margins": 5.125, "rewards/rejected": -10.1875, "step": 21650 }, { "epoch": 0.8598138260921343, "grad_norm": 30.139628446996582, "learning_rate": 2.9334704597668207e-08, "logits/chosen": -2.609375, "logits/rejected": -2.859375, "logps/chosen": -708.0, "logps/rejected": -1208.0, "loss": 0.2353, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.09375, "rewards/margins": 5.25, "rewards/rejected": -10.3125, "step": 21660 }, { "epoch": 0.8602107853839589, "grad_norm": 24.760825199474127, "learning_rate": 2.9172089620392692e-08, "logits/chosen": -2.59375, "logits/rejected": -2.78125, "logps/chosen": -652.0, "logps/rejected": -1160.0, "loss": 0.199, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.9375, "rewards/margins": 5.0, "rewards/rejected": -9.9375, "step": 21670 }, { "epoch": 0.8606077446757835, "grad_norm": 23.55916885741168, "learning_rate": 2.900989868844736e-08, "logits/chosen": -2.59375, "logits/rejected": -3.09375, "logps/chosen": -636.0, "logps/rejected": -1144.0, "loss": 0.2286, "rewards/accuracies": 0.96875, "rewards/chosen": -4.75, "rewards/margins": 5.28125, "rewards/rejected": -10.0625, "step": 21680 }, { "epoch": 0.8610047039676081, "grad_norm": 24.79780328659332, "learning_rate": 2.8848132113279715e-08, "logits/chosen": -2.609375, "logits/rejected": -2.84375, "logps/chosen": -672.0, "logps/rejected": -1208.0, "loss": 0.2155, "rewards/accuracies": 0.96875, "rewards/chosen": -4.75, "rewards/margins": 5.65625, "rewards/rejected": -10.375, "step": 21690 }, { "epoch": 0.8614016632594328, "grad_norm": 27.00887805058239, "learning_rate": 2.8686790205522532e-08, "logits/chosen": -2.65625, "logits/rejected": -2.703125, "logps/chosen": -660.0, "logps/rejected": -1168.0, "loss": 0.1938, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.96875, "rewards/margins": 5.09375, "rewards/rejected": -10.0625, "step": 21700 }, { "epoch": 0.8617986225512574, "grad_norm": 36.57818643203237, "learning_rate": 2.852587327499287e-08, "logits/chosen": -2.625, "logits/rejected": -2.84375, "logps/chosen": -656.0, "logps/rejected": -1184.0, "loss": 0.202, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.71875, "rewards/margins": 5.4375, "rewards/rejected": -10.1875, "step": 21710 }, { "epoch": 0.862195581843082, "grad_norm": 26.466656908863722, "learning_rate": 2.8365381630691966e-08, "logits/chosen": -2.609375, "logits/rejected": -2.84375, "logps/chosen": -680.0, "logps/rejected": -1184.0, "loss": 0.2146, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.9375, "rewards/margins": 5.125, "rewards/rejected": -10.0625, "step": 21720 }, { "epoch": 0.8625925411349066, "grad_norm": 20.90380335815127, "learning_rate": 2.8205315580804217e-08, "logits/chosen": -2.6875, "logits/rejected": -2.90625, "logps/chosen": -688.0, "logps/rejected": -1144.0, "loss": 0.2146, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.1875, "rewards/margins": 4.65625, "rewards/rejected": -9.8125, "step": 21730 }, { "epoch": 0.8629895004267313, "grad_norm": 24.43969743714269, "learning_rate": 2.8045675432696895e-08, "logits/chosen": -2.6875, "logits/rejected": -2.8125, "logps/chosen": -664.0, "logps/rejected": -1200.0, "loss": 0.2146, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.8125, "rewards/margins": 5.40625, "rewards/rejected": -10.25, "step": 21740 }, { "epoch": 0.8633864597185559, "grad_norm": 17.55325938018986, "learning_rate": 2.788646149291926e-08, "logits/chosen": -2.640625, "logits/rejected": -2.828125, "logps/chosen": -660.0, "logps/rejected": -1200.0, "loss": 0.1944, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.875, "rewards/margins": 5.40625, "rewards/rejected": -10.25, "step": 21750 }, { "epoch": 0.8637834190103805, "grad_norm": 27.771928028649853, "learning_rate": 2.772767406720239e-08, "logits/chosen": -2.828125, "logits/rejected": -3.046875, "logps/chosen": -692.0, "logps/rejected": -1176.0, "loss": 0.1666, "rewards/accuracies": 0.96875, "rewards/chosen": -5.1875, "rewards/margins": 5.0625, "rewards/rejected": -10.25, "step": 21760 }, { "epoch": 0.8641803783022051, "grad_norm": 31.010795727789514, "learning_rate": 2.7569313460458238e-08, "logits/chosen": -2.8125, "logits/rejected": -3.0625, "logps/chosen": -668.0, "logps/rejected": -1168.0, "loss": 0.2231, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.0, "rewards/margins": 5.21875, "rewards/rejected": -10.1875, "step": 21770 }, { "epoch": 0.8645773375940298, "grad_norm": 27.646922772362817, "learning_rate": 2.741137997677906e-08, "logits/chosen": -2.65625, "logits/rejected": -3.015625, "logps/chosen": -652.0, "logps/rejected": -1112.0, "loss": 0.2396, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.90625, "rewards/margins": 4.6875, "rewards/rejected": -9.5625, "step": 21780 }, { "epoch": 0.8649742968858544, "grad_norm": 38.63027283974077, "learning_rate": 2.725387391943704e-08, "logits/chosen": -2.671875, "logits/rejected": -2.890625, "logps/chosen": -656.0, "logps/rejected": -1216.0, "loss": 0.2089, "rewards/accuracies": 0.96875, "rewards/chosen": -4.78125, "rewards/margins": 5.6875, "rewards/rejected": -10.4375, "step": 21790 }, { "epoch": 0.865371256177679, "grad_norm": 23.44246376851095, "learning_rate": 2.7096795590883542e-08, "logits/chosen": -2.796875, "logits/rejected": -3.0625, "logps/chosen": -664.0, "logps/rejected": -1176.0, "loss": 0.1887, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.9375, "rewards/margins": 5.0625, "rewards/rejected": -10.0, "step": 21800 }, { "epoch": 0.8657682154695036, "grad_norm": 14.7230251278596, "learning_rate": 2.6940145292748683e-08, "logits/chosen": -2.640625, "logits/rejected": -2.78125, "logps/chosen": -676.0, "logps/rejected": -1200.0, "loss": 0.2217, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.96875, "rewards/margins": 5.4375, "rewards/rejected": -10.4375, "step": 21810 }, { "epoch": 0.8661651747613283, "grad_norm": 20.769271479719823, "learning_rate": 2.6783923325840564e-08, "logits/chosen": -2.671875, "logits/rejected": -3.0, "logps/chosen": -652.0, "logps/rejected": -1208.0, "loss": 0.185, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.6875, "rewards/margins": 5.6875, "rewards/rejected": -10.375, "step": 21820 }, { "epoch": 0.8665621340531529, "grad_norm": 32.661442634109996, "learning_rate": 2.6628129990144742e-08, "logits/chosen": -2.6875, "logits/rejected": -3.15625, "logps/chosen": -688.0, "logps/rejected": -1168.0, "loss": 0.2346, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.125, "rewards/margins": 5.0625, "rewards/rejected": -10.1875, "step": 21830 }, { "epoch": 0.8669590933449774, "grad_norm": 30.862834387174004, "learning_rate": 2.6472765584823727e-08, "logits/chosen": -2.65625, "logits/rejected": -2.875, "logps/chosen": -664.0, "logps/rejected": -1200.0, "loss": 0.1979, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.96875, "rewards/margins": 5.4375, "rewards/rejected": -10.4375, "step": 21840 }, { "epoch": 0.867356052636802, "grad_norm": 25.739954075584258, "learning_rate": 2.6317830408216418e-08, "logits/chosen": -2.6875, "logits/rejected": -3.015625, "logps/chosen": -648.0, "logps/rejected": -1168.0, "loss": 0.1873, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.75, "rewards/margins": 5.3125, "rewards/rejected": -10.0625, "step": 21850 }, { "epoch": 0.8677530119286267, "grad_norm": 40.10380466956504, "learning_rate": 2.616332475783753e-08, "logits/chosen": -2.609375, "logits/rejected": -2.65625, "logps/chosen": -668.0, "logps/rejected": -1168.0, "loss": 0.2232, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.9375, "rewards/margins": 5.09375, "rewards/rejected": -10.0, "step": 21860 }, { "epoch": 0.8681499712204513, "grad_norm": 21.85681063461056, "learning_rate": 2.6009248930376898e-08, "logits/chosen": -2.703125, "logits/rejected": -2.921875, "logps/chosen": -688.0, "logps/rejected": -1168.0, "loss": 0.2198, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.09375, "rewards/margins": 4.84375, "rewards/rejected": -9.9375, "step": 21870 }, { "epoch": 0.8685469305122759, "grad_norm": 28.779443259809263, "learning_rate": 2.5855603221698942e-08, "logits/chosen": -2.671875, "logits/rejected": -2.8125, "logps/chosen": -660.0, "logps/rejected": -1232.0, "loss": 0.1863, "rewards/accuracies": 0.96875, "rewards/chosen": -4.9375, "rewards/margins": 5.78125, "rewards/rejected": -10.6875, "step": 21880 }, { "epoch": 0.8689438898041006, "grad_norm": 35.16263157342758, "learning_rate": 2.5702387926842206e-08, "logits/chosen": -2.5625, "logits/rejected": -2.796875, "logps/chosen": -668.0, "logps/rejected": -1232.0, "loss": 0.2057, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.90625, "rewards/margins": 5.59375, "rewards/rejected": -10.5, "step": 21890 }, { "epoch": 0.8693408490959252, "grad_norm": 27.517018751896483, "learning_rate": 2.5549603340018738e-08, "logits/chosen": -2.71875, "logits/rejected": -2.953125, "logps/chosen": -672.0, "logps/rejected": -1200.0, "loss": 0.2252, "rewards/accuracies": 0.96875, "rewards/chosen": -4.875, "rewards/margins": 5.25, "rewards/rejected": -10.125, "step": 21900 }, { "epoch": 0.8697378083877498, "grad_norm": 34.672021097598474, "learning_rate": 2.5397249754613605e-08, "logits/chosen": -2.796875, "logits/rejected": -2.9375, "logps/chosen": -676.0, "logps/rejected": -1232.0, "loss": 0.2119, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.03125, "rewards/margins": 5.6875, "rewards/rejected": -10.6875, "step": 21910 }, { "epoch": 0.8701347676795744, "grad_norm": 25.3464154389385, "learning_rate": 2.5245327463184056e-08, "logits/chosen": -2.90625, "logits/rejected": -2.96875, "logps/chosen": -640.0, "logps/rejected": -1216.0, "loss": 0.177, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.75, "rewards/margins": 5.5625, "rewards/rejected": -10.3125, "step": 21920 }, { "epoch": 0.8705317269713991, "grad_norm": 30.571216813239978, "learning_rate": 2.509383675745927e-08, "logits/chosen": -2.6875, "logits/rejected": -2.953125, "logps/chosen": -680.0, "logps/rejected": -1200.0, "loss": 0.2516, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.0, "rewards/margins": 5.09375, "rewards/rejected": -10.125, "step": 21930 }, { "epoch": 0.8709286862632237, "grad_norm": 21.351334916635484, "learning_rate": 2.494277792833957e-08, "logits/chosen": -2.75, "logits/rejected": -3.015625, "logps/chosen": -664.0, "logps/rejected": -1208.0, "loss": 0.1833, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.96875, "rewards/margins": 5.34375, "rewards/rejected": -10.3125, "step": 21940 }, { "epoch": 0.8713256455550483, "grad_norm": 34.58537563503963, "learning_rate": 2.4792151265896122e-08, "logits/chosen": -2.703125, "logits/rejected": -2.984375, "logps/chosen": -680.0, "logps/rejected": -1200.0, "loss": 0.2009, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.09375, "rewards/margins": 5.25, "rewards/rejected": -10.3125, "step": 21950 }, { "epoch": 0.8717226048468729, "grad_norm": 24.83760576998931, "learning_rate": 2.4641957059370167e-08, "logits/chosen": -2.65625, "logits/rejected": -2.984375, "logps/chosen": -676.0, "logps/rejected": -1200.0, "loss": 0.207, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.9375, "rewards/margins": 5.34375, "rewards/rejected": -10.3125, "step": 21960 }, { "epoch": 0.8721195641386976, "grad_norm": 22.520432933234094, "learning_rate": 2.4492195597172417e-08, "logits/chosen": -2.6875, "logits/rejected": -3.015625, "logps/chosen": -648.0, "logps/rejected": -1144.0, "loss": 0.2098, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.75, "rewards/margins": 5.09375, "rewards/rejected": -9.875, "step": 21970 }, { "epoch": 0.8725165234305222, "grad_norm": 30.851514942978863, "learning_rate": 2.434286716688269e-08, "logits/chosen": -2.6875, "logits/rejected": -2.953125, "logps/chosen": -664.0, "logps/rejected": -1200.0, "loss": 0.2087, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.9375, "rewards/margins": 5.53125, "rewards/rejected": -10.4375, "step": 21980 }, { "epoch": 0.8729134827223468, "grad_norm": 29.499786601709378, "learning_rate": 2.419397205524934e-08, "logits/chosen": -2.578125, "logits/rejected": -2.9375, "logps/chosen": -720.0, "logps/rejected": -1208.0, "loss": 0.2422, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.1875, "rewards/margins": 5.15625, "rewards/rejected": -10.3125, "step": 21990 }, { "epoch": 0.8733104420141714, "grad_norm": 23.75488871489287, "learning_rate": 2.4045510548188485e-08, "logits/chosen": -2.703125, "logits/rejected": -2.859375, "logps/chosen": -684.0, "logps/rejected": -1232.0, "loss": 0.1826, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.0, "rewards/margins": 5.59375, "rewards/rejected": -10.5625, "step": 22000 }, { "epoch": 0.8733104420141714, "eval_logits/chosen": -2.703125, "eval_logits/rejected": -2.9375, "eval_logps/chosen": -712.0, "eval_logps/rejected": -1120.0, "eval_loss": 0.25283005833625793, "eval_rewards/accuracies": 0.8949275612831116, "eval_rewards/chosen": -5.28125, "eval_rewards/margins": 4.28125, "eval_rewards/rejected": -9.5625, "eval_runtime": 5410.345, "eval_samples_per_second": 32.648, "eval_steps_per_second": 0.51, "step": 22000 }, { "epoch": 0.8737074013059961, "grad_norm": 29.491131062876352, "learning_rate": 2.3897482930783697e-08, "logits/chosen": -2.625, "logits/rejected": -2.953125, "logps/chosen": -644.0, "logps/rejected": -1152.0, "loss": 0.2148, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.78125, "rewards/margins": 5.125, "rewards/rejected": -9.875, "step": 22010 }, { "epoch": 0.8741043605978207, "grad_norm": 35.11414662782172, "learning_rate": 2.374988948728543e-08, "logits/chosen": -2.59375, "logits/rejected": -2.78125, "logps/chosen": -680.0, "logps/rejected": -1192.0, "loss": 0.191, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.03125, "rewards/margins": 5.125, "rewards/rejected": -10.1875, "step": 22020 }, { "epoch": 0.8745013198896453, "grad_norm": 28.723484117560467, "learning_rate": 2.360273050111028e-08, "logits/chosen": -2.625, "logits/rejected": -2.796875, "logps/chosen": -684.0, "logps/rejected": -1200.0, "loss": 0.1929, "rewards/accuracies": 0.96875, "rewards/chosen": -4.875, "rewards/margins": 5.34375, "rewards/rejected": -10.1875, "step": 22030 }, { "epoch": 0.8748982791814699, "grad_norm": 29.378003114162098, "learning_rate": 2.3456006254840732e-08, "logits/chosen": -2.671875, "logits/rejected": -2.9375, "logps/chosen": -680.0, "logps/rejected": -1192.0, "loss": 0.1981, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.03125, "rewards/margins": 5.21875, "rewards/rejected": -10.25, "step": 22040 }, { "epoch": 0.8752952384732946, "grad_norm": 24.718914080216834, "learning_rate": 2.33097170302243e-08, "logits/chosen": -2.703125, "logits/rejected": -3.046875, "logps/chosen": -704.0, "logps/rejected": -1224.0, "loss": 0.1541, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -5.125, "rewards/margins": 5.375, "rewards/rejected": -10.5, "step": 22050 }, { "epoch": 0.8756921977651192, "grad_norm": 22.12758750478856, "learning_rate": 2.3163863108173225e-08, "logits/chosen": -2.71875, "logits/rejected": -2.96875, "logps/chosen": -684.0, "logps/rejected": -1216.0, "loss": 0.2105, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.0, "rewards/margins": 5.4375, "rewards/rejected": -10.4375, "step": 22060 }, { "epoch": 0.8760891570569438, "grad_norm": 24.728563097963058, "learning_rate": 2.301844476876391e-08, "logits/chosen": -2.625, "logits/rejected": -3.046875, "logps/chosen": -640.0, "logps/rejected": -1200.0, "loss": 0.1856, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.71875, "rewards/margins": 5.625, "rewards/rejected": -10.3125, "step": 22070 }, { "epoch": 0.8764861163487684, "grad_norm": 30.63214787618474, "learning_rate": 2.2873462291236183e-08, "logits/chosen": -2.703125, "logits/rejected": -3.109375, "logps/chosen": -656.0, "logps/rejected": -1128.0, "loss": 0.2258, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.96875, "rewards/margins": 4.875, "rewards/rejected": -9.8125, "step": 22080 }, { "epoch": 0.8768830756405931, "grad_norm": 25.5450171440399, "learning_rate": 2.2728915953993122e-08, "logits/chosen": -2.828125, "logits/rejected": -3.21875, "logps/chosen": -644.0, "logps/rejected": -1176.0, "loss": 0.1902, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.75, "rewards/margins": 5.5, "rewards/rejected": -10.25, "step": 22090 }, { "epoch": 0.8772800349324177, "grad_norm": 26.554703818934243, "learning_rate": 2.2584806034600112e-08, "logits/chosen": -2.75, "logits/rejected": -2.859375, "logps/chosen": -664.0, "logps/rejected": -1200.0, "loss": 0.2235, "rewards/accuracies": 0.96875, "rewards/chosen": -4.875, "rewards/margins": 5.25, "rewards/rejected": -10.125, "step": 22100 }, { "epoch": 0.8776769942242423, "grad_norm": 32.6489882004861, "learning_rate": 2.244113280978452e-08, "logits/chosen": -2.71875, "logits/rejected": -2.9375, "logps/chosen": -684.0, "logps/rejected": -1192.0, "loss": 0.2117, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.15625, "rewards/margins": 5.03125, "rewards/rejected": -10.1875, "step": 22110 }, { "epoch": 0.8780739535160669, "grad_norm": 28.748091351042635, "learning_rate": 2.229789655543532e-08, "logits/chosen": -2.75, "logits/rejected": -3.015625, "logps/chosen": -648.0, "logps/rejected": -1176.0, "loss": 0.187, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.84375, "rewards/margins": 5.28125, "rewards/rejected": -10.0625, "step": 22120 }, { "epoch": 0.8784709128078916, "grad_norm": 26.575985395039712, "learning_rate": 2.2155097546602163e-08, "logits/chosen": -2.5625, "logits/rejected": -2.90625, "logps/chosen": -680.0, "logps/rejected": -1200.0, "loss": 0.2042, "rewards/accuracies": 0.96875, "rewards/chosen": -4.9375, "rewards/margins": 5.15625, "rewards/rejected": -10.0625, "step": 22130 }, { "epoch": 0.8788678720997162, "grad_norm": 25.118591219628343, "learning_rate": 2.201273605749529e-08, "logits/chosen": -2.703125, "logits/rejected": -2.8125, "logps/chosen": -672.0, "logps/rejected": -1192.0, "loss": 0.1913, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.8125, "rewards/margins": 5.40625, "rewards/rejected": -10.25, "step": 22140 }, { "epoch": 0.8792648313915408, "grad_norm": 34.34622137725618, "learning_rate": 2.1870812361484665e-08, "logits/chosen": -2.625, "logits/rejected": -2.828125, "logps/chosen": -640.0, "logps/rejected": -1176.0, "loss": 0.2248, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.59375, "rewards/margins": 5.46875, "rewards/rejected": -10.0625, "step": 22150 }, { "epoch": 0.8796617906833655, "grad_norm": 14.238088352269514, "learning_rate": 2.1729326731099535e-08, "logits/chosen": -2.609375, "logits/rejected": -2.765625, "logps/chosen": -684.0, "logps/rejected": -1176.0, "loss": 0.2078, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.0625, "rewards/margins": 4.96875, "rewards/rejected": -10.0, "step": 22160 }, { "epoch": 0.8800587499751901, "grad_norm": 20.371477357837666, "learning_rate": 2.15882794380281e-08, "logits/chosen": -2.875, "logits/rejected": -3.0625, "logps/chosen": -696.0, "logps/rejected": -1208.0, "loss": 0.2128, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.28125, "rewards/margins": 5.0, "rewards/rejected": -10.3125, "step": 22170 }, { "epoch": 0.8804557092670147, "grad_norm": 24.944677012663828, "learning_rate": 2.144767075311682e-08, "logits/chosen": -2.703125, "logits/rejected": -2.828125, "logps/chosen": -680.0, "logps/rejected": -1192.0, "loss": 0.1968, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -5.0625, "rewards/margins": 5.1875, "rewards/rejected": -10.25, "step": 22180 }, { "epoch": 0.8808526685588393, "grad_norm": 39.21712481682777, "learning_rate": 2.1307500946369766e-08, "logits/chosen": -2.578125, "logits/rejected": -2.765625, "logps/chosen": -660.0, "logps/rejected": -1192.0, "loss": 0.2155, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.0, "rewards/margins": 5.125, "rewards/rejected": -10.125, "step": 22190 }, { "epoch": 0.881249627850664, "grad_norm": 29.52038241596885, "learning_rate": 2.1167770286948383e-08, "logits/chosen": -2.640625, "logits/rejected": -2.640625, "logps/chosen": -684.0, "logps/rejected": -1224.0, "loss": 0.1918, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.15625, "rewards/margins": 5.34375, "rewards/rejected": -10.5, "step": 22200 }, { "epoch": 0.8816465871424886, "grad_norm": 41.694830820229676, "learning_rate": 2.102847904317076e-08, "logits/chosen": -2.65625, "logits/rejected": -2.6875, "logps/chosen": -680.0, "logps/rejected": -1176.0, "loss": 0.1982, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.96875, "rewards/margins": 5.15625, "rewards/rejected": -10.125, "step": 22210 }, { "epoch": 0.8820435464343132, "grad_norm": 18.961285488847658, "learning_rate": 2.0889627482511303e-08, "logits/chosen": -2.625, "logits/rejected": -2.8125, "logps/chosen": -664.0, "logps/rejected": -1200.0, "loss": 0.1903, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.84375, "rewards/margins": 5.34375, "rewards/rejected": -10.1875, "step": 22220 }, { "epoch": 0.8824405057261377, "grad_norm": 26.760128113922264, "learning_rate": 2.0751215871600086e-08, "logits/chosen": -2.75, "logits/rejected": -3.015625, "logps/chosen": -672.0, "logps/rejected": -1152.0, "loss": 0.2118, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.125, "rewards/margins": 4.875, "rewards/rejected": -10.0, "step": 22230 }, { "epoch": 0.8828374650179625, "grad_norm": 36.15983324498696, "learning_rate": 2.0613244476222287e-08, "logits/chosen": -2.703125, "logits/rejected": -3.15625, "logps/chosen": -688.0, "logps/rejected": -1152.0, "loss": 0.2135, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.96875, "rewards/margins": 5.125, "rewards/rejected": -10.125, "step": 22240 }, { "epoch": 0.883234424309787, "grad_norm": 19.636050450078848, "learning_rate": 2.047571356131783e-08, "logits/chosen": -2.796875, "logits/rejected": -3.15625, "logps/chosen": -652.0, "logps/rejected": -1184.0, "loss": 0.1843, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.8125, "rewards/margins": 5.4375, "rewards/rejected": -10.25, "step": 22250 }, { "epoch": 0.8836313836016116, "grad_norm": 35.65316862312713, "learning_rate": 2.0338623390980747e-08, "logits/chosen": -2.578125, "logits/rejected": -2.96875, "logps/chosen": -652.0, "logps/rejected": -1184.0, "loss": 0.2341, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.71875, "rewards/margins": 5.4375, "rewards/rejected": -10.1875, "step": 22260 }, { "epoch": 0.8840283428934362, "grad_norm": 34.928598222396964, "learning_rate": 2.020197422845879e-08, "logits/chosen": -2.59375, "logits/rejected": -3.015625, "logps/chosen": -704.0, "logps/rejected": -1216.0, "loss": 0.2442, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.1875, "rewards/margins": 5.25, "rewards/rejected": -10.4375, "step": 22270 }, { "epoch": 0.8844253021852609, "grad_norm": 18.883533778028774, "learning_rate": 2.0065766336152924e-08, "logits/chosen": -2.53125, "logits/rejected": -2.65625, "logps/chosen": -648.0, "logps/rejected": -1168.0, "loss": 0.1924, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.90625, "rewards/margins": 5.21875, "rewards/rejected": -10.125, "step": 22280 }, { "epoch": 0.8848222614770855, "grad_norm": 36.787752670937984, "learning_rate": 1.992999997561662e-08, "logits/chosen": -2.65625, "logits/rejected": -2.71875, "logps/chosen": -720.0, "logps/rejected": -1200.0, "loss": 0.1849, "rewards/accuracies": 0.9375, "rewards/chosen": -5.3125, "rewards/margins": 4.78125, "rewards/rejected": -10.0625, "step": 22290 }, { "epoch": 0.8852192207689101, "grad_norm": 42.95676772996328, "learning_rate": 1.9794675407555556e-08, "logits/chosen": -2.671875, "logits/rejected": -3.03125, "logps/chosen": -704.0, "logps/rejected": -1216.0, "loss": 0.2199, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.28125, "rewards/margins": 5.15625, "rewards/rejected": -10.4375, "step": 22300 }, { "epoch": 0.8856161800607347, "grad_norm": 33.318048251520274, "learning_rate": 1.965979289182701e-08, "logits/chosen": -2.6875, "logits/rejected": -2.9375, "logps/chosen": -656.0, "logps/rejected": -1192.0, "loss": 0.1899, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.96875, "rewards/margins": 5.46875, "rewards/rejected": -10.4375, "step": 22310 }, { "epoch": 0.8860131393525594, "grad_norm": 38.208474591109685, "learning_rate": 1.9525352687439543e-08, "logits/chosen": -2.71875, "logits/rejected": -3.046875, "logps/chosen": -700.0, "logps/rejected": -1232.0, "loss": 0.2015, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.25, "rewards/margins": 5.34375, "rewards/rejected": -10.625, "step": 22320 }, { "epoch": 0.886410098644384, "grad_norm": 31.314476526132854, "learning_rate": 1.9391355052552295e-08, "logits/chosen": -2.6875, "logits/rejected": -2.890625, "logps/chosen": -696.0, "logps/rejected": -1208.0, "loss": 0.1946, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.21875, "rewards/margins": 5.15625, "rewards/rejected": -10.375, "step": 22330 }, { "epoch": 0.8868070579362086, "grad_norm": 26.491353949671208, "learning_rate": 1.9257800244474486e-08, "logits/chosen": -2.703125, "logits/rejected": -2.875, "logps/chosen": -680.0, "logps/rejected": -1160.0, "loss": 0.1826, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.09375, "rewards/margins": 4.8125, "rewards/rejected": -9.9375, "step": 22340 }, { "epoch": 0.8872040172280332, "grad_norm": 28.038337005856278, "learning_rate": 1.912468851966506e-08, "logits/chosen": -2.703125, "logits/rejected": -2.8125, "logps/chosen": -676.0, "logps/rejected": -1208.0, "loss": 0.2218, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.03125, "rewards/margins": 5.46875, "rewards/rejected": -10.5, "step": 22350 }, { "epoch": 0.8876009765198579, "grad_norm": 25.996489090269563, "learning_rate": 1.8992020133732217e-08, "logits/chosen": -2.765625, "logits/rejected": -3.140625, "logps/chosen": -688.0, "logps/rejected": -1216.0, "loss": 0.1697, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.125, "rewards/margins": 5.34375, "rewards/rejected": -10.4375, "step": 22360 }, { "epoch": 0.8879979358116825, "grad_norm": 24.279097667305702, "learning_rate": 1.8859795341432617e-08, "logits/chosen": -2.734375, "logits/rejected": -2.890625, "logps/chosen": -668.0, "logps/rejected": -1216.0, "loss": 0.1612, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.03125, "rewards/margins": 5.59375, "rewards/rejected": -10.625, "step": 22370 }, { "epoch": 0.8883948951035071, "grad_norm": 18.308441662481773, "learning_rate": 1.8728014396671348e-08, "logits/chosen": -2.640625, "logits/rejected": -3.0625, "logps/chosen": -664.0, "logps/rejected": -1200.0, "loss": 0.2225, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.09375, "rewards/margins": 5.46875, "rewards/rejected": -10.5625, "step": 22380 }, { "epoch": 0.8887918543953317, "grad_norm": 40.30439387657173, "learning_rate": 1.859667755250105e-08, "logits/chosen": -2.875, "logits/rejected": -3.109375, "logps/chosen": -712.0, "logps/rejected": -1144.0, "loss": 0.2486, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.375, "rewards/margins": 4.53125, "rewards/rejected": -9.875, "step": 22390 }, { "epoch": 0.8891888136871564, "grad_norm": 26.888708664247847, "learning_rate": 1.8465785061121592e-08, "logits/chosen": -2.6875, "logits/rejected": -2.953125, "logps/chosen": -692.0, "logps/rejected": -1216.0, "loss": 0.216, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.09375, "rewards/margins": 5.34375, "rewards/rejected": -10.4375, "step": 22400 }, { "epoch": 0.889585772978981, "grad_norm": 28.252299815590185, "learning_rate": 1.8335337173879638e-08, "logits/chosen": -2.703125, "logits/rejected": -2.859375, "logps/chosen": -676.0, "logps/rejected": -1208.0, "loss": 0.1818, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.9375, "rewards/margins": 5.46875, "rewards/rejected": -10.4375, "step": 22410 }, { "epoch": 0.8899827322708056, "grad_norm": 19.920716338809733, "learning_rate": 1.8205334141268013e-08, "logits/chosen": -2.78125, "logits/rejected": -3.015625, "logps/chosen": -668.0, "logps/rejected": -1168.0, "loss": 0.2442, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.03125, "rewards/margins": 5.0, "rewards/rejected": -10.0, "step": 22420 }, { "epoch": 0.8903796915626303, "grad_norm": 29.455199137887064, "learning_rate": 1.8075776212925416e-08, "logits/chosen": -2.8125, "logits/rejected": -3.078125, "logps/chosen": -688.0, "logps/rejected": -1184.0, "loss": 0.2431, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.0625, "rewards/margins": 5.03125, "rewards/rejected": -10.125, "step": 22430 }, { "epoch": 0.8907766508544549, "grad_norm": 28.058243136723046, "learning_rate": 1.7946663637635722e-08, "logits/chosen": -2.75, "logits/rejected": -3.125, "logps/chosen": -664.0, "logps/rejected": -1136.0, "loss": 0.2337, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -4.875, "rewards/margins": 4.9375, "rewards/rejected": -9.8125, "step": 22440 }, { "epoch": 0.8911736101462795, "grad_norm": 39.64748097142431, "learning_rate": 1.7817996663327674e-08, "logits/chosen": -2.828125, "logits/rejected": -2.8125, "logps/chosen": -648.0, "logps/rejected": -1136.0, "loss": 0.1962, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.90625, "rewards/margins": 4.875, "rewards/rejected": -9.75, "step": 22450 }, { "epoch": 0.8915705694381041, "grad_norm": 12.30512065698449, "learning_rate": 1.7689775537074375e-08, "logits/chosen": -2.734375, "logits/rejected": -2.921875, "logps/chosen": -652.0, "logps/rejected": -1184.0, "loss": 0.2187, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.0, "rewards/margins": 5.1875, "rewards/rejected": -10.1875, "step": 22460 }, { "epoch": 0.8919675287299288, "grad_norm": 21.216458021733057, "learning_rate": 1.756200050509271e-08, "logits/chosen": -2.703125, "logits/rejected": -2.9375, "logps/chosen": -688.0, "logps/rejected": -1192.0, "loss": 0.2044, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -5.09375, "rewards/margins": 5.125, "rewards/rejected": -10.25, "step": 22470 }, { "epoch": 0.8923644880217534, "grad_norm": 16.295533827932083, "learning_rate": 1.7434671812743023e-08, "logits/chosen": -2.6875, "logits/rejected": -2.9375, "logps/chosen": -672.0, "logps/rejected": -1176.0, "loss": 0.1915, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.0625, "rewards/margins": 5.21875, "rewards/rejected": -10.25, "step": 22480 }, { "epoch": 0.892761447313578, "grad_norm": 29.313342596709592, "learning_rate": 1.730778970452851e-08, "logits/chosen": -2.75, "logits/rejected": -2.9375, "logps/chosen": -656.0, "logps/rejected": -1184.0, "loss": 0.2083, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.90625, "rewards/margins": 5.15625, "rewards/rejected": -10.0625, "step": 22490 }, { "epoch": 0.8931584066054026, "grad_norm": 31.815354712483565, "learning_rate": 1.7181354424094908e-08, "logits/chosen": -2.546875, "logits/rejected": -2.796875, "logps/chosen": -668.0, "logps/rejected": -1192.0, "loss": 0.2318, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.96875, "rewards/margins": 5.15625, "rewards/rejected": -10.125, "step": 22500 }, { "epoch": 0.8935553658972273, "grad_norm": 31.946596608297277, "learning_rate": 1.7055366214229843e-08, "logits/chosen": -2.671875, "logits/rejected": -3.0625, "logps/chosen": -660.0, "logps/rejected": -1136.0, "loss": 0.2208, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.0, "rewards/margins": 4.8125, "rewards/rejected": -9.8125, "step": 22510 }, { "epoch": 0.8939523251890519, "grad_norm": 18.220292041972293, "learning_rate": 1.6929825316862433e-08, "logits/chosen": -2.734375, "logits/rejected": -2.921875, "logps/chosen": -688.0, "logps/rejected": -1208.0, "loss": 0.1857, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.03125, "rewards/margins": 5.40625, "rewards/rejected": -10.4375, "step": 22520 }, { "epoch": 0.8943492844808765, "grad_norm": 15.61306813911572, "learning_rate": 1.680473197306298e-08, "logits/chosen": -2.5625, "logits/rejected": -2.578125, "logps/chosen": -724.0, "logps/rejected": -1200.0, "loss": 0.177, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.4375, "rewards/margins": 4.90625, "rewards/rejected": -10.375, "step": 22530 }, { "epoch": 0.8947462437727011, "grad_norm": 33.991932545439134, "learning_rate": 1.6680086423042167e-08, "logits/chosen": -2.890625, "logits/rejected": -3.046875, "logps/chosen": -676.0, "logps/rejected": -1200.0, "loss": 0.2305, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.0625, "rewards/margins": 5.375, "rewards/rejected": -10.4375, "step": 22540 }, { "epoch": 0.8951432030645258, "grad_norm": 17.164118416327096, "learning_rate": 1.6555888906151033e-08, "logits/chosen": -2.75, "logits/rejected": -3.015625, "logps/chosen": -676.0, "logps/rejected": -1192.0, "loss": 0.1883, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.96875, "rewards/margins": 5.3125, "rewards/rejected": -10.25, "step": 22550 }, { "epoch": 0.8955401623563504, "grad_norm": 41.159499366712424, "learning_rate": 1.6432139660880085e-08, "logits/chosen": -2.796875, "logits/rejected": -2.953125, "logps/chosen": -656.0, "logps/rejected": -1184.0, "loss": 0.1877, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.84375, "rewards/margins": 5.40625, "rewards/rejected": -10.25, "step": 22560 }, { "epoch": 0.895937121648175, "grad_norm": 25.217440773212076, "learning_rate": 1.6308838924859154e-08, "logits/chosen": -2.8125, "logits/rejected": -3.0625, "logps/chosen": -688.0, "logps/rejected": -1216.0, "loss": 0.183, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.1875, "rewards/margins": 5.34375, "rewards/rejected": -10.5, "step": 22570 }, { "epoch": 0.8963340809399996, "grad_norm": 25.05153375738368, "learning_rate": 1.6185986934856704e-08, "logits/chosen": -2.828125, "logits/rejected": -2.90625, "logps/chosen": -684.0, "logps/rejected": -1168.0, "loss": 0.1972, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.15625, "rewards/margins": 4.8125, "rewards/rejected": -10.0, "step": 22580 }, { "epoch": 0.8967310402318243, "grad_norm": 35.996410215216756, "learning_rate": 1.6063583926779617e-08, "logits/chosen": -2.734375, "logits/rejected": -2.953125, "logps/chosen": -656.0, "logps/rejected": -1216.0, "loss": 0.2254, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.875, "rewards/margins": 5.53125, "rewards/rejected": -10.4375, "step": 22590 }, { "epoch": 0.8971279995236489, "grad_norm": 54.74101659745568, "learning_rate": 1.5941630135672595e-08, "logits/chosen": -2.609375, "logits/rejected": -3.015625, "logps/chosen": -660.0, "logps/rejected": -1168.0, "loss": 0.2327, "rewards/accuracies": 0.96875, "rewards/chosen": -4.75, "rewards/margins": 5.28125, "rewards/rejected": -10.0625, "step": 22600 }, { "epoch": 0.8975249588154735, "grad_norm": 34.988385427646264, "learning_rate": 1.5820125795717705e-08, "logits/chosen": -2.71875, "logits/rejected": -2.921875, "logps/chosen": -672.0, "logps/rejected": -1216.0, "loss": 0.2159, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.0625, "rewards/margins": 5.28125, "rewards/rejected": -10.3125, "step": 22610 }, { "epoch": 0.897921918107298, "grad_norm": 30.200560110201394, "learning_rate": 1.5699071140233888e-08, "logits/chosen": -2.640625, "logits/rejected": -2.953125, "logps/chosen": -648.0, "logps/rejected": -1160.0, "loss": 0.2301, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.9375, "rewards/margins": 5.0, "rewards/rejected": -9.9375, "step": 22620 }, { "epoch": 0.8983188773991228, "grad_norm": 27.57465322459346, "learning_rate": 1.5578466401676648e-08, "logits/chosen": -2.71875, "logits/rejected": -2.96875, "logps/chosen": -640.0, "logps/rejected": -1160.0, "loss": 0.1985, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.65625, "rewards/margins": 5.1875, "rewards/rejected": -9.875, "step": 22630 }, { "epoch": 0.8987158366909473, "grad_norm": 25.72712425589527, "learning_rate": 1.5458311811637537e-08, "logits/chosen": -2.65625, "logits/rejected": -2.90625, "logps/chosen": -668.0, "logps/rejected": -1208.0, "loss": 0.2285, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.0625, "rewards/margins": 5.375, "rewards/rejected": -10.4375, "step": 22640 }, { "epoch": 0.8991127959827719, "grad_norm": 34.02880991857904, "learning_rate": 1.533860760084374e-08, "logits/chosen": -2.546875, "logits/rejected": -2.71875, "logps/chosen": -704.0, "logps/rejected": -1240.0, "loss": 0.2158, "rewards/accuracies": 0.96875, "rewards/chosen": -5.125, "rewards/margins": 5.40625, "rewards/rejected": -10.5625, "step": 22650 }, { "epoch": 0.8995097552745966, "grad_norm": 36.1279766398002, "learning_rate": 1.5219353999157526e-08, "logits/chosen": -2.765625, "logits/rejected": -3.03125, "logps/chosen": -644.0, "logps/rejected": -1184.0, "loss": 0.2011, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.875, "rewards/margins": 5.3125, "rewards/rejected": -10.1875, "step": 22660 }, { "epoch": 0.8999067145664212, "grad_norm": 26.94930056569843, "learning_rate": 1.510055123557588e-08, "logits/chosen": -2.78125, "logits/rejected": -3.09375, "logps/chosen": -684.0, "logps/rejected": -1152.0, "loss": 0.2269, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.25, "rewards/margins": 4.78125, "rewards/rejected": -10.0, "step": 22670 }, { "epoch": 0.9003036738582458, "grad_norm": 14.404112242562825, "learning_rate": 1.4982199538230128e-08, "logits/chosen": -2.75, "logits/rejected": -2.953125, "logps/chosen": -688.0, "logps/rejected": -1200.0, "loss": 0.2027, "rewards/accuracies": 0.96875, "rewards/chosen": -5.25, "rewards/margins": 5.09375, "rewards/rejected": -10.3125, "step": 22680 }, { "epoch": 0.9007006331500704, "grad_norm": 39.71742765022404, "learning_rate": 1.4864299134385444e-08, "logits/chosen": -2.65625, "logits/rejected": -2.9375, "logps/chosen": -688.0, "logps/rejected": -1216.0, "loss": 0.1842, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.09375, "rewards/margins": 5.4375, "rewards/rejected": -10.5, "step": 22690 }, { "epoch": 0.9010975924418951, "grad_norm": 25.615481899167932, "learning_rate": 1.474685025044034e-08, "logits/chosen": -2.765625, "logits/rejected": -2.890625, "logps/chosen": -680.0, "logps/rejected": -1224.0, "loss": 0.2088, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -5.09375, "rewards/margins": 5.40625, "rewards/rejected": -10.5, "step": 22700 }, { "epoch": 0.9014945517337197, "grad_norm": 30.909969542349, "learning_rate": 1.4629853111926299e-08, "logits/chosen": -2.671875, "logits/rejected": -2.8125, "logps/chosen": -660.0, "logps/rejected": -1256.0, "loss": 0.1834, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.8125, "rewards/margins": 5.84375, "rewards/rejected": -10.6875, "step": 22710 }, { "epoch": 0.9018915110255443, "grad_norm": 30.894357531706596, "learning_rate": 1.4513307943507303e-08, "logits/chosen": -2.6875, "logits/rejected": -2.703125, "logps/chosen": -668.0, "logps/rejected": -1256.0, "loss": 0.1988, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.6875, "rewards/margins": 5.90625, "rewards/rejected": -10.5625, "step": 22720 }, { "epoch": 0.9022884703173689, "grad_norm": 24.322374759726568, "learning_rate": 1.4397214968979581e-08, "logits/chosen": -2.546875, "logits/rejected": -2.953125, "logps/chosen": -660.0, "logps/rejected": -1184.0, "loss": 0.1983, "rewards/accuracies": 0.96875, "rewards/chosen": -4.71875, "rewards/margins": 5.34375, "rewards/rejected": -10.0625, "step": 22730 }, { "epoch": 0.9026854296091936, "grad_norm": 26.99418175273592, "learning_rate": 1.428157441127098e-08, "logits/chosen": -2.65625, "logits/rejected": -2.78125, "logps/chosen": -660.0, "logps/rejected": -1200.0, "loss": 0.1693, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.9375, "rewards/margins": 5.28125, "rewards/rejected": -10.1875, "step": 22740 }, { "epoch": 0.9030823889010182, "grad_norm": 24.61326949495997, "learning_rate": 1.4166386492440479e-08, "logits/chosen": -2.515625, "logits/rejected": -3.03125, "logps/chosen": -692.0, "logps/rejected": -1192.0, "loss": 0.2308, "rewards/accuracies": 0.96875, "rewards/chosen": -5.0625, "rewards/margins": 5.21875, "rewards/rejected": -10.25, "step": 22750 }, { "epoch": 0.9034793481928428, "grad_norm": 15.142507588722701, "learning_rate": 1.405165143367798e-08, "logits/chosen": -2.703125, "logits/rejected": -2.90625, "logps/chosen": -692.0, "logps/rejected": -1272.0, "loss": 0.2027, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.15625, "rewards/margins": 5.5, "rewards/rejected": -10.625, "step": 22760 }, { "epoch": 0.9038763074846674, "grad_norm": 31.247139638625036, "learning_rate": 1.3937369455303744e-08, "logits/chosen": -2.8125, "logits/rejected": -3.0, "logps/chosen": -676.0, "logps/rejected": -1168.0, "loss": 0.1932, "rewards/accuracies": 0.96875, "rewards/chosen": -5.03125, "rewards/margins": 4.96875, "rewards/rejected": -10.0, "step": 22770 }, { "epoch": 0.9042732667764921, "grad_norm": 21.382085860859927, "learning_rate": 1.3823540776768033e-08, "logits/chosen": -2.796875, "logits/rejected": -3.015625, "logps/chosen": -676.0, "logps/rejected": -1216.0, "loss": 0.1969, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.0625, "rewards/margins": 5.4375, "rewards/rejected": -10.5, "step": 22780 }, { "epoch": 0.9046702260683167, "grad_norm": 23.02961203166156, "learning_rate": 1.3710165616650699e-08, "logits/chosen": -2.625, "logits/rejected": -3.0, "logps/chosen": -660.0, "logps/rejected": -1208.0, "loss": 0.2026, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.875, "rewards/margins": 5.5, "rewards/rejected": -10.375, "step": 22790 }, { "epoch": 0.9050671853601413, "grad_norm": 117.8181586479366, "learning_rate": 1.359724419266059e-08, "logits/chosen": -2.546875, "logits/rejected": -2.8125, "logps/chosen": -652.0, "logps/rejected": -1184.0, "loss": 0.2267, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.75, "rewards/margins": 5.53125, "rewards/rejected": -10.25, "step": 22800 }, { "epoch": 0.9054641446519659, "grad_norm": 25.262517931593862, "learning_rate": 1.3484776721635393e-08, "logits/chosen": -2.65625, "logits/rejected": -2.65625, "logps/chosen": -688.0, "logps/rejected": -1232.0, "loss": 0.2021, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.0, "rewards/margins": 5.375, "rewards/rejected": -10.3125, "step": 22810 }, { "epoch": 0.9058611039437906, "grad_norm": 27.03892943049932, "learning_rate": 1.3372763419540966e-08, "logits/chosen": -2.65625, "logits/rejected": -2.90625, "logps/chosen": -660.0, "logps/rejected": -1272.0, "loss": 0.1793, "rewards/accuracies": 1.0, "rewards/chosen": -4.75, "rewards/margins": 6.03125, "rewards/rejected": -10.8125, "step": 22820 }, { "epoch": 0.9062580632356152, "grad_norm": 32.1248103241936, "learning_rate": 1.3261204501471225e-08, "logits/chosen": -2.78125, "logits/rejected": -2.859375, "logps/chosen": -664.0, "logps/rejected": -1200.0, "loss": 0.2054, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -5.03125, "rewards/margins": 5.15625, "rewards/rejected": -10.1875, "step": 22830 }, { "epoch": 0.9066550225274398, "grad_norm": 31.93788086378247, "learning_rate": 1.3150100181647333e-08, "logits/chosen": -2.65625, "logits/rejected": -3.078125, "logps/chosen": -692.0, "logps/rejected": -1192.0, "loss": 0.2113, "rewards/accuracies": 0.96875, "rewards/chosen": -5.21875, "rewards/margins": 5.09375, "rewards/rejected": -10.3125, "step": 22840 }, { "epoch": 0.9070519818192644, "grad_norm": 27.1559008108912, "learning_rate": 1.3039450673417773e-08, "logits/chosen": -2.671875, "logits/rejected": -2.84375, "logps/chosen": -664.0, "logps/rejected": -1192.0, "loss": 0.222, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.75, "rewards/margins": 5.375, "rewards/rejected": -10.125, "step": 22850 }, { "epoch": 0.9074489411110891, "grad_norm": 26.348372645448496, "learning_rate": 1.2929256189257415e-08, "logits/chosen": -2.625, "logits/rejected": -2.84375, "logps/chosen": -704.0, "logps/rejected": -1200.0, "loss": 0.2077, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.15625, "rewards/margins": 5.1875, "rewards/rejected": -10.3125, "step": 22860 }, { "epoch": 0.9078459004029137, "grad_norm": 18.924587468107173, "learning_rate": 1.2819516940767578e-08, "logits/chosen": -2.765625, "logits/rejected": -3.046875, "logps/chosen": -660.0, "logps/rejected": -1184.0, "loss": 0.2043, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.78125, "rewards/margins": 5.34375, "rewards/rejected": -10.125, "step": 22870 }, { "epoch": 0.9082428596947383, "grad_norm": 25.29710015920818, "learning_rate": 1.2710233138675252e-08, "logits/chosen": -2.71875, "logits/rejected": -2.9375, "logps/chosen": -664.0, "logps/rejected": -1200.0, "loss": 0.2085, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.8125, "rewards/margins": 5.4375, "rewards/rejected": -10.25, "step": 22880 }, { "epoch": 0.9086398189865629, "grad_norm": 35.661124962456924, "learning_rate": 1.2601404992832908e-08, "logits/chosen": -2.59375, "logits/rejected": -2.9375, "logps/chosen": -660.0, "logps/rejected": -1144.0, "loss": 0.1985, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.78125, "rewards/margins": 5.21875, "rewards/rejected": -10.0, "step": 22890 }, { "epoch": 0.9090367782783876, "grad_norm": 23.8501280816141, "learning_rate": 1.2493032712218133e-08, "logits/chosen": -2.609375, "logits/rejected": -2.84375, "logps/chosen": -692.0, "logps/rejected": -1176.0, "loss": 0.1659, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.03125, "rewards/margins": 4.96875, "rewards/rejected": -10.0, "step": 22900 }, { "epoch": 0.9094337375702122, "grad_norm": 27.789827779888153, "learning_rate": 1.2385116504932958e-08, "logits/chosen": -2.59375, "logits/rejected": -2.921875, "logps/chosen": -660.0, "logps/rejected": -1160.0, "loss": 0.2312, "rewards/accuracies": 0.96875, "rewards/chosen": -4.75, "rewards/margins": 5.3125, "rewards/rejected": -10.0625, "step": 22910 }, { "epoch": 0.9098306968620368, "grad_norm": 31.65283022338293, "learning_rate": 1.227765657820387e-08, "logits/chosen": -2.609375, "logits/rejected": -2.6875, "logps/chosen": -652.0, "logps/rejected": -1152.0, "loss": 0.1876, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.875, "rewards/margins": 4.96875, "rewards/rejected": -9.8125, "step": 22920 }, { "epoch": 0.9102276561538615, "grad_norm": 36.41967732621722, "learning_rate": 1.2170653138380976e-08, "logits/chosen": -2.71875, "logits/rejected": -3.0, "logps/chosen": -636.0, "logps/rejected": -1160.0, "loss": 0.2103, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.6875, "rewards/margins": 5.15625, "rewards/rejected": -9.875, "step": 22930 }, { "epoch": 0.9106246154456861, "grad_norm": 27.449743446686647, "learning_rate": 1.2064106390937856e-08, "logits/chosen": -2.703125, "logits/rejected": -3.078125, "logps/chosen": -656.0, "logps/rejected": -1208.0, "loss": 0.1843, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.875, "rewards/margins": 5.65625, "rewards/rejected": -10.5625, "step": 22940 }, { "epoch": 0.9110215747375107, "grad_norm": 34.62404917515111, "learning_rate": 1.1958016540471238e-08, "logits/chosen": -2.671875, "logits/rejected": -2.9375, "logps/chosen": -664.0, "logps/rejected": -1120.0, "loss": 0.2255, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.09375, "rewards/margins": 4.6875, "rewards/rejected": -9.8125, "step": 22950 }, { "epoch": 0.9114185340293353, "grad_norm": 24.129475011579512, "learning_rate": 1.185238379070036e-08, "logits/chosen": -2.640625, "logits/rejected": -2.6875, "logps/chosen": -688.0, "logps/rejected": -1192.0, "loss": 0.1775, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.0, "rewards/margins": 5.1875, "rewards/rejected": -10.1875, "step": 22960 }, { "epoch": 0.91181549332116, "grad_norm": 20.576512169522058, "learning_rate": 1.1747208344466825e-08, "logits/chosen": -2.65625, "logits/rejected": -2.78125, "logps/chosen": -676.0, "logps/rejected": -1176.0, "loss": 0.1942, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.90625, "rewards/margins": 5.09375, "rewards/rejected": -10.0, "step": 22970 }, { "epoch": 0.9122124526129846, "grad_norm": 21.748439284681783, "learning_rate": 1.1642490403733996e-08, "logits/chosen": -2.703125, "logits/rejected": -2.875, "logps/chosen": -680.0, "logps/rejected": -1200.0, "loss": 0.2006, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.9375, "rewards/margins": 5.25, "rewards/rejected": -10.1875, "step": 22980 }, { "epoch": 0.9126094119048092, "grad_norm": 35.28114848001062, "learning_rate": 1.1538230169586716e-08, "logits/chosen": -2.859375, "logits/rejected": -2.875, "logps/chosen": -648.0, "logps/rejected": -1168.0, "loss": 0.2344, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.75, "rewards/margins": 5.21875, "rewards/rejected": -10.0, "step": 22990 }, { "epoch": 0.9130063711966337, "grad_norm": 15.089410584909318, "learning_rate": 1.1434427842231004e-08, "logits/chosen": -2.59375, "logits/rejected": -2.890625, "logps/chosen": -664.0, "logps/rejected": -1184.0, "loss": 0.1944, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.96875, "rewards/margins": 5.25, "rewards/rejected": -10.25, "step": 23000 }, { "epoch": 0.9130063711966337, "eval_logits/chosen": -2.703125, "eval_logits/rejected": -2.9375, "eval_logps/chosen": -712.0, "eval_logps/rejected": -1120.0, "eval_loss": 0.251930296421051, "eval_rewards/accuracies": 0.8955162763595581, "eval_rewards/chosen": -5.25, "eval_rewards/margins": 4.28125, "eval_rewards/rejected": -9.5625, "eval_runtime": 5408.3512, "eval_samples_per_second": 32.66, "eval_steps_per_second": 0.51, "step": 23000 }, { "epoch": 0.9134033304884585, "grad_norm": 40.5511181099884, "learning_rate": 1.1331083620993554e-08, "logits/chosen": -2.625, "logits/rejected": -2.765625, "logps/chosen": -672.0, "logps/rejected": -1184.0, "loss": 0.2341, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.90625, "rewards/margins": 5.125, "rewards/rejected": -10.0625, "step": 23010 }, { "epoch": 0.913800289780283, "grad_norm": 21.87633526136623, "learning_rate": 1.1228197704321319e-08, "logits/chosen": -2.71875, "logits/rejected": -2.828125, "logps/chosen": -656.0, "logps/rejected": -1216.0, "loss": 0.183, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -5.0, "rewards/margins": 5.40625, "rewards/rejected": -10.375, "step": 23020 }, { "epoch": 0.9141972490721076, "grad_norm": 24.47573695380781, "learning_rate": 1.1125770289781233e-08, "logits/chosen": -2.796875, "logits/rejected": -2.890625, "logps/chosen": -684.0, "logps/rejected": -1232.0, "loss": 0.2108, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.0625, "rewards/margins": 5.59375, "rewards/rejected": -10.625, "step": 23030 }, { "epoch": 0.9145942083639322, "grad_norm": 23.639347084399503, "learning_rate": 1.1023801574059743e-08, "logits/chosen": -2.765625, "logits/rejected": -2.765625, "logps/chosen": -648.0, "logps/rejected": -1264.0, "loss": 0.2277, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.90625, "rewards/margins": 5.84375, "rewards/rejected": -10.75, "step": 23040 }, { "epoch": 0.9149911676557569, "grad_norm": 32.95036613064561, "learning_rate": 1.0922291752962582e-08, "logits/chosen": -2.59375, "logits/rejected": -2.90625, "logps/chosen": -664.0, "logps/rejected": -1184.0, "loss": 0.2596, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -4.96875, "rewards/margins": 5.25, "rewards/rejected": -10.1875, "step": 23050 }, { "epoch": 0.9153881269475815, "grad_norm": 23.140540240216215, "learning_rate": 1.0821241021414267e-08, "logits/chosen": -2.75, "logits/rejected": -2.984375, "logps/chosen": -668.0, "logps/rejected": -1200.0, "loss": 0.1965, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.9375, "rewards/margins": 5.3125, "rewards/rejected": -10.25, "step": 23060 }, { "epoch": 0.9157850862394061, "grad_norm": 26.391782167510407, "learning_rate": 1.0720649573457642e-08, "logits/chosen": -2.65625, "logits/rejected": -2.96875, "logps/chosen": -664.0, "logps/rejected": -1160.0, "loss": 0.2007, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.03125, "rewards/margins": 5.125, "rewards/rejected": -10.1875, "step": 23070 }, { "epoch": 0.9161820455312307, "grad_norm": 31.5584435114591, "learning_rate": 1.0620517602253686e-08, "logits/chosen": -2.765625, "logits/rejected": -3.0, "logps/chosen": -652.0, "logps/rejected": -1144.0, "loss": 0.2108, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.9375, "rewards/margins": 4.9375, "rewards/rejected": -9.875, "step": 23080 }, { "epoch": 0.9165790048230554, "grad_norm": 34.03657384010724, "learning_rate": 1.0520845300081045e-08, "logits/chosen": -2.734375, "logits/rejected": -2.953125, "logps/chosen": -668.0, "logps/rejected": -1168.0, "loss": 0.2033, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.96875, "rewards/margins": 5.09375, "rewards/rejected": -10.125, "step": 23090 }, { "epoch": 0.91697596411488, "grad_norm": 13.620129024663676, "learning_rate": 1.0421632858335749e-08, "logits/chosen": -2.609375, "logits/rejected": -2.953125, "logps/chosen": -672.0, "logps/rejected": -1144.0, "loss": 0.2277, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.96875, "rewards/margins": 4.78125, "rewards/rejected": -9.8125, "step": 23100 }, { "epoch": 0.9173729234067046, "grad_norm": 30.547725030624104, "learning_rate": 1.0322880467530715e-08, "logits/chosen": -2.640625, "logits/rejected": -2.65625, "logps/chosen": -652.0, "logps/rejected": -1232.0, "loss": 0.1886, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.875, "rewards/margins": 5.875, "rewards/rejected": -10.75, "step": 23110 }, { "epoch": 0.9177698826985292, "grad_norm": 34.06210358680329, "learning_rate": 1.0224588317295447e-08, "logits/chosen": -2.796875, "logits/rejected": -2.828125, "logps/chosen": -648.0, "logps/rejected": -1184.0, "loss": 0.2202, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.875, "rewards/margins": 5.375, "rewards/rejected": -10.25, "step": 23120 }, { "epoch": 0.9181668419903539, "grad_norm": 19.23619461675806, "learning_rate": 1.0126756596375685e-08, "logits/chosen": -2.625, "logits/rejected": -2.703125, "logps/chosen": -684.0, "logps/rejected": -1192.0, "loss": 0.1743, "rewards/accuracies": 0.9375, "rewards/chosen": -5.0, "rewards/margins": 5.125, "rewards/rejected": -10.125, "step": 23130 }, { "epoch": 0.9185638012821785, "grad_norm": 29.579292761861005, "learning_rate": 1.0029385492632991e-08, "logits/chosen": -2.703125, "logits/rejected": -2.9375, "logps/chosen": -672.0, "logps/rejected": -1144.0, "loss": 0.2412, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.03125, "rewards/margins": 4.8125, "rewards/rejected": -9.8125, "step": 23140 }, { "epoch": 0.9189607605740031, "grad_norm": 44.42345139285817, "learning_rate": 9.932475193044521e-09, "logits/chosen": -2.796875, "logits/rejected": -2.953125, "logps/chosen": -668.0, "logps/rejected": -1208.0, "loss": 0.2008, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.78125, "rewards/margins": 5.40625, "rewards/rejected": -10.1875, "step": 23150 }, { "epoch": 0.9193577198658277, "grad_norm": 22.375003717084162, "learning_rate": 9.836025883702526e-09, "logits/chosen": -2.609375, "logits/rejected": -2.828125, "logps/chosen": -676.0, "logps/rejected": -1232.0, "loss": 0.2208, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.90625, "rewards/margins": 5.59375, "rewards/rejected": -10.5, "step": 23160 }, { "epoch": 0.9197546791576524, "grad_norm": 26.658082404816273, "learning_rate": 9.740037749813995e-09, "logits/chosen": -2.640625, "logits/rejected": -2.953125, "logps/chosen": -688.0, "logps/rejected": -1288.0, "loss": 0.1684, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.03125, "rewards/margins": 5.5625, "rewards/rejected": -10.625, "step": 23170 }, { "epoch": 0.920151638449477, "grad_norm": 20.783930824442223, "learning_rate": 9.644510975700376e-09, "logits/chosen": -2.65625, "logits/rejected": -2.984375, "logps/chosen": -668.0, "logps/rejected": -1216.0, "loss": 0.1885, "rewards/accuracies": 0.96875, "rewards/chosen": -4.9375, "rewards/margins": 5.5625, "rewards/rejected": -10.5, "step": 23180 }, { "epoch": 0.9205485977413016, "grad_norm": 25.135020880334526, "learning_rate": 9.549445744797157e-09, "logits/chosen": -2.59375, "logits/rejected": -2.90625, "logps/chosen": -648.0, "logps/rejected": -1200.0, "loss": 0.189, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.6875, "rewards/margins": 5.5, "rewards/rejected": -10.1875, "step": 23190 }, { "epoch": 0.9209455570331263, "grad_norm": 26.507207823893392, "learning_rate": 9.454842239653593e-09, "logits/chosen": -2.75, "logits/rejected": -2.921875, "logps/chosen": -720.0, "logps/rejected": -1224.0, "loss": 0.1961, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.375, "rewards/margins": 5.125, "rewards/rejected": -10.5, "step": 23200 }, { "epoch": 0.9213425163249509, "grad_norm": 20.885372874201547, "learning_rate": 9.36070064193234e-09, "logits/chosen": -2.6875, "logits/rejected": -2.9375, "logps/chosen": -664.0, "logps/rejected": -1200.0, "loss": 0.2021, "rewards/accuracies": 0.96875, "rewards/chosen": -4.9375, "rewards/margins": 5.5, "rewards/rejected": -10.4375, "step": 23210 }, { "epoch": 0.9217394756167755, "grad_norm": 31.37408696194007, "learning_rate": 9.267021132409014e-09, "logits/chosen": -2.703125, "logits/rejected": -2.9375, "logps/chosen": -668.0, "logps/rejected": -1216.0, "loss": 0.186, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.875, "rewards/margins": 5.65625, "rewards/rejected": -10.5625, "step": 23220 }, { "epoch": 0.9221364349086001, "grad_norm": 28.571996878060634, "learning_rate": 9.173803890971888e-09, "logits/chosen": -2.71875, "logits/rejected": -3.0625, "logps/chosen": -676.0, "logps/rejected": -1232.0, "loss": 0.2084, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.0625, "rewards/margins": 5.5625, "rewards/rejected": -10.625, "step": 23230 }, { "epoch": 0.9225333942004248, "grad_norm": 20.439409504322526, "learning_rate": 9.081049096621607e-09, "logits/chosen": -2.71875, "logits/rejected": -3.1875, "logps/chosen": -676.0, "logps/rejected": -1232.0, "loss": 0.205, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.0625, "rewards/margins": 5.625, "rewards/rejected": -10.6875, "step": 23240 }, { "epoch": 0.9229303534922494, "grad_norm": 37.43367963260724, "learning_rate": 8.988756927470781e-09, "logits/chosen": -2.71875, "logits/rejected": -2.828125, "logps/chosen": -688.0, "logps/rejected": -1184.0, "loss": 0.1984, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.15625, "rewards/margins": 5.0, "rewards/rejected": -10.1875, "step": 23250 }, { "epoch": 0.923327312784074, "grad_norm": 39.80566676322997, "learning_rate": 8.896927560743727e-09, "logits/chosen": -2.765625, "logits/rejected": -2.8125, "logps/chosen": -664.0, "logps/rejected": -1248.0, "loss": 0.2273, "rewards/accuracies": 0.96875, "rewards/chosen": -4.9375, "rewards/margins": 5.75, "rewards/rejected": -10.6875, "step": 23260 }, { "epoch": 0.9237242720758986, "grad_norm": 31.489482180326693, "learning_rate": 8.80556117277595e-09, "logits/chosen": -2.703125, "logits/rejected": -2.75, "logps/chosen": -636.0, "logps/rejected": -1168.0, "loss": 0.1958, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.8125, "rewards/margins": 5.15625, "rewards/rejected": -9.9375, "step": 23270 }, { "epoch": 0.9241212313677233, "grad_norm": 19.202211900632307, "learning_rate": 8.714657939013993e-09, "logits/chosen": -2.765625, "logits/rejected": -3.140625, "logps/chosen": -688.0, "logps/rejected": -1160.0, "loss": 0.208, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.90625, "rewards/margins": 5.125, "rewards/rejected": -10.0, "step": 23280 }, { "epoch": 0.9245181906595479, "grad_norm": 27.811178235123933, "learning_rate": 8.624218034015002e-09, "logits/chosen": -2.671875, "logits/rejected": -3.03125, "logps/chosen": -708.0, "logps/rejected": -1240.0, "loss": 0.225, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.21875, "rewards/margins": 5.53125, "rewards/rejected": -10.75, "step": 23290 }, { "epoch": 0.9249151499513725, "grad_norm": 27.98492188259847, "learning_rate": 8.534241631446449e-09, "logits/chosen": -2.609375, "logits/rejected": -2.953125, "logps/chosen": -644.0, "logps/rejected": -1208.0, "loss": 0.2066, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.6875, "rewards/margins": 5.75, "rewards/rejected": -10.4375, "step": 23300 }, { "epoch": 0.9253121092431971, "grad_norm": 28.2949669052452, "learning_rate": 8.444728904085735e-09, "logits/chosen": -2.734375, "logits/rejected": -2.9375, "logps/chosen": -656.0, "logps/rejected": -1176.0, "loss": 0.2315, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.9375, "rewards/margins": 5.21875, "rewards/rejected": -10.125, "step": 23310 }, { "epoch": 0.9257090685350218, "grad_norm": 28.101428639091775, "learning_rate": 8.355680023819866e-09, "logits/chosen": -2.703125, "logits/rejected": -3.0625, "logps/chosen": -684.0, "logps/rejected": -1152.0, "loss": 0.1849, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.15625, "rewards/margins": 4.875, "rewards/rejected": -10.0, "step": 23320 }, { "epoch": 0.9261060278268464, "grad_norm": 27.149312089709245, "learning_rate": 8.2670951616452e-09, "logits/chosen": -2.640625, "logits/rejected": -2.765625, "logps/chosen": -660.0, "logps/rejected": -1184.0, "loss": 0.1828, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.875, "rewards/margins": 5.25, "rewards/rejected": -10.125, "step": 23330 }, { "epoch": 0.926502987118671, "grad_norm": 31.694435318743153, "learning_rate": 8.178974487667023e-09, "logits/chosen": -2.78125, "logits/rejected": -2.96875, "logps/chosen": -696.0, "logps/rejected": -1176.0, "loss": 0.2286, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.21875, "rewards/margins": 5.0625, "rewards/rejected": -10.25, "step": 23340 }, { "epoch": 0.9268999464104956, "grad_norm": 27.60596177765539, "learning_rate": 8.091318171099233e-09, "logits/chosen": -2.75, "logits/rejected": -2.90625, "logps/chosen": -660.0, "logps/rejected": -1168.0, "loss": 0.2251, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.03125, "rewards/margins": 4.96875, "rewards/rejected": -10.0, "step": 23350 }, { "epoch": 0.9272969057023203, "grad_norm": 32.11603312890564, "learning_rate": 8.004126380264153e-09, "logits/chosen": -2.65625, "logits/rejected": -2.71875, "logps/chosen": -676.0, "logps/rejected": -1224.0, "loss": 0.1899, "rewards/accuracies": 0.96875, "rewards/chosen": -5.0, "rewards/margins": 5.28125, "rewards/rejected": -10.25, "step": 23360 }, { "epoch": 0.9276938649941449, "grad_norm": 31.205155317774288, "learning_rate": 7.91739928259197e-09, "logits/chosen": -2.8125, "logits/rejected": -3.078125, "logps/chosen": -692.0, "logps/rejected": -1192.0, "loss": 0.2138, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.15625, "rewards/margins": 5.21875, "rewards/rejected": -10.375, "step": 23370 }, { "epoch": 0.9280908242859695, "grad_norm": 30.64774884163458, "learning_rate": 7.831137044620683e-09, "logits/chosen": -2.796875, "logits/rejected": -3.046875, "logps/chosen": -688.0, "logps/rejected": -1192.0, "loss": 0.2545, "rewards/accuracies": 0.9375, "rewards/chosen": -5.15625, "rewards/margins": 5.03125, "rewards/rejected": -10.1875, "step": 23380 }, { "epoch": 0.928487783577794, "grad_norm": 24.045311520571957, "learning_rate": 7.74533983199549e-09, "logits/chosen": -2.671875, "logits/rejected": -2.984375, "logps/chosen": -668.0, "logps/rejected": -1208.0, "loss": 0.1644, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.875, "rewards/margins": 5.46875, "rewards/rejected": -10.375, "step": 23390 }, { "epoch": 0.9288847428696188, "grad_norm": 25.03979555629566, "learning_rate": 7.660007809468738e-09, "logits/chosen": -2.6875, "logits/rejected": -2.875, "logps/chosen": -676.0, "logps/rejected": -1168.0, "loss": 0.2201, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.9375, "rewards/margins": 5.15625, "rewards/rejected": -10.0625, "step": 23400 }, { "epoch": 0.9292817021614433, "grad_norm": 17.09286515584699, "learning_rate": 7.575141140899383e-09, "logits/chosen": -2.6875, "logits/rejected": -2.84375, "logps/chosen": -696.0, "logps/rejected": -1248.0, "loss": 0.2042, "rewards/accuracies": 0.96875, "rewards/chosen": -5.15625, "rewards/margins": 5.34375, "rewards/rejected": -10.5, "step": 23410 }, { "epoch": 0.9296786614532679, "grad_norm": 19.121205994347324, "learning_rate": 7.490739989252926e-09, "logits/chosen": -2.625, "logits/rejected": -2.859375, "logps/chosen": -652.0, "logps/rejected": -1208.0, "loss": 0.1989, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.84375, "rewards/margins": 5.46875, "rewards/rejected": -10.3125, "step": 23420 }, { "epoch": 0.9300756207450926, "grad_norm": 24.259025689627062, "learning_rate": 7.406804516600862e-09, "logits/chosen": -2.828125, "logits/rejected": -3.046875, "logps/chosen": -664.0, "logps/rejected": -1152.0, "loss": 0.2095, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.0625, "rewards/margins": 4.9375, "rewards/rejected": -10.0, "step": 23430 }, { "epoch": 0.9304725800369172, "grad_norm": 26.803236573821128, "learning_rate": 7.323334884120485e-09, "logits/chosen": -2.703125, "logits/rejected": -2.6875, "logps/chosen": -676.0, "logps/rejected": -1208.0, "loss": 0.1706, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.125, "rewards/margins": 5.1875, "rewards/rejected": -10.3125, "step": 23440 }, { "epoch": 0.9308695393287418, "grad_norm": 34.73043344570691, "learning_rate": 7.240331252094556e-09, "logits/chosen": -2.75, "logits/rejected": -3.03125, "logps/chosen": -660.0, "logps/rejected": -1168.0, "loss": 0.1966, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.90625, "rewards/margins": 5.0, "rewards/rejected": -9.9375, "step": 23450 }, { "epoch": 0.9312664986205664, "grad_norm": 22.13877180574277, "learning_rate": 7.157793779910964e-09, "logits/chosen": -2.71875, "logits/rejected": -2.953125, "logps/chosen": -652.0, "logps/rejected": -1176.0, "loss": 0.1783, "rewards/accuracies": 0.9375, "rewards/chosen": -4.96875, "rewards/margins": 5.15625, "rewards/rejected": -10.125, "step": 23460 }, { "epoch": 0.9316634579123911, "grad_norm": 29.991314664437912, "learning_rate": 7.075722626062541e-09, "logits/chosen": -2.8125, "logits/rejected": -2.984375, "logps/chosen": -656.0, "logps/rejected": -1184.0, "loss": 0.2132, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.90625, "rewards/margins": 5.46875, "rewards/rejected": -10.375, "step": 23470 }, { "epoch": 0.9320604172042157, "grad_norm": 24.61936268407158, "learning_rate": 6.994117948146638e-09, "logits/chosen": -2.703125, "logits/rejected": -2.9375, "logps/chosen": -636.0, "logps/rejected": -1200.0, "loss": 0.2096, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.75, "rewards/margins": 5.6875, "rewards/rejected": -10.4375, "step": 23480 }, { "epoch": 0.9324573764960403, "grad_norm": 28.149965452716042, "learning_rate": 6.912979902864769e-09, "logits/chosen": -2.75, "logits/rejected": -2.859375, "logps/chosen": -676.0, "logps/rejected": -1200.0, "loss": 0.2295, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.90625, "rewards/margins": 5.34375, "rewards/rejected": -10.25, "step": 23490 }, { "epoch": 0.9328543357878649, "grad_norm": 25.086924839512687, "learning_rate": 6.832308646022522e-09, "logits/chosen": -2.796875, "logits/rejected": -3.046875, "logps/chosen": -660.0, "logps/rejected": -1192.0, "loss": 0.1672, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.9375, "rewards/margins": 5.3125, "rewards/rejected": -10.25, "step": 23500 }, { "epoch": 0.9332512950796896, "grad_norm": 26.783113361486986, "learning_rate": 6.7521043325290124e-09, "logits/chosen": -2.78125, "logits/rejected": -3.28125, "logps/chosen": -688.0, "logps/rejected": -1176.0, "loss": 0.1906, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.0625, "rewards/margins": 5.125, "rewards/rejected": -10.1875, "step": 23510 }, { "epoch": 0.9336482543715142, "grad_norm": 29.792928232002154, "learning_rate": 6.672367116396793e-09, "logits/chosen": -2.6875, "logits/rejected": -2.953125, "logps/chosen": -688.0, "logps/rejected": -1224.0, "loss": 0.2192, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.15625, "rewards/margins": 4.96875, "rewards/rejected": -10.125, "step": 23520 }, { "epoch": 0.9340452136633388, "grad_norm": 24.109627603720167, "learning_rate": 6.593097150741495e-09, "logits/chosen": -2.734375, "logits/rejected": -3.109375, "logps/chosen": -652.0, "logps/rejected": -1192.0, "loss": 0.1974, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.8125, "rewards/margins": 5.4375, "rewards/rejected": -10.25, "step": 23530 }, { "epoch": 0.9344421729551634, "grad_norm": 39.188475350130744, "learning_rate": 6.51429458778141e-09, "logits/chosen": -2.609375, "logits/rejected": -3.109375, "logps/chosen": -672.0, "logps/rejected": -1184.0, "loss": 0.2157, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.96875, "rewards/margins": 5.25, "rewards/rejected": -10.1875, "step": 23540 }, { "epoch": 0.9348391322469881, "grad_norm": 29.20266832623092, "learning_rate": 6.43595957883733e-09, "logits/chosen": -2.65625, "logits/rejected": -2.859375, "logps/chosen": -688.0, "logps/rejected": -1184.0, "loss": 0.1773, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.09375, "rewards/margins": 5.0625, "rewards/rejected": -10.125, "step": 23550 }, { "epoch": 0.9352360915388127, "grad_norm": 22.91835294457946, "learning_rate": 6.358092274332289e-09, "logits/chosen": -2.71875, "logits/rejected": -2.953125, "logps/chosen": -680.0, "logps/rejected": -1192.0, "loss": 0.2223, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.15625, "rewards/margins": 5.1875, "rewards/rejected": -10.375, "step": 23560 }, { "epoch": 0.9356330508306373, "grad_norm": 33.498977740812705, "learning_rate": 6.280692823791178e-09, "logits/chosen": -2.6875, "logits/rejected": -2.875, "logps/chosen": -672.0, "logps/rejected": -1176.0, "loss": 0.2089, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.875, "rewards/margins": 5.125, "rewards/rejected": -10.0, "step": 23570 }, { "epoch": 0.9360300101224619, "grad_norm": 34.83652521715825, "learning_rate": 6.203761375840472e-09, "logits/chosen": -2.703125, "logits/rejected": -2.828125, "logps/chosen": -652.0, "logps/rejected": -1144.0, "loss": 0.2035, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.78125, "rewards/margins": 5.0, "rewards/rejected": -9.8125, "step": 23580 }, { "epoch": 0.9364269694142866, "grad_norm": 30.285319348228743, "learning_rate": 6.127298078207943e-09, "logits/chosen": -2.6875, "logits/rejected": -2.875, "logps/chosen": -708.0, "logps/rejected": -1176.0, "loss": 0.225, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.09375, "rewards/margins": 4.875, "rewards/rejected": -10.0, "step": 23590 }, { "epoch": 0.9368239287061112, "grad_norm": 21.59928536189618, "learning_rate": 6.051303077722391e-09, "logits/chosen": -2.703125, "logits/rejected": -2.796875, "logps/chosen": -684.0, "logps/rejected": -1224.0, "loss": 0.1765, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.03125, "rewards/margins": 5.53125, "rewards/rejected": -10.5625, "step": 23600 }, { "epoch": 0.9372208879979358, "grad_norm": 27.3919847131767, "learning_rate": 5.975776520313475e-09, "logits/chosen": -2.765625, "logits/rejected": -3.15625, "logps/chosen": -648.0, "logps/rejected": -1144.0, "loss": 0.1926, "rewards/accuracies": 0.96875, "rewards/chosen": -4.78125, "rewards/margins": 5.21875, "rewards/rejected": -10.0, "step": 23610 }, { "epoch": 0.9376178472897604, "grad_norm": 18.592317315831288, "learning_rate": 5.9007185510112356e-09, "logits/chosen": -2.6875, "logits/rejected": -2.875, "logps/chosen": -696.0, "logps/rejected": -1264.0, "loss": 0.2028, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.0625, "rewards/margins": 5.53125, "rewards/rejected": -10.5625, "step": 23620 }, { "epoch": 0.9380148065815851, "grad_norm": 33.35446551181669, "learning_rate": 5.82612931394591e-09, "logits/chosen": -2.671875, "logits/rejected": -3.03125, "logps/chosen": -676.0, "logps/rejected": -1256.0, "loss": 0.1898, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.90625, "rewards/margins": 5.96875, "rewards/rejected": -10.875, "step": 23630 }, { "epoch": 0.9384117658734097, "grad_norm": 39.74139007332182, "learning_rate": 5.752008952347648e-09, "logits/chosen": -2.625, "logits/rejected": -3.0, "logps/chosen": -664.0, "logps/rejected": -1136.0, "loss": 0.2153, "rewards/accuracies": 0.96875, "rewards/chosen": -4.8125, "rewards/margins": 5.125, "rewards/rejected": -9.9375, "step": 23640 }, { "epoch": 0.9388087251652343, "grad_norm": 18.60749030725242, "learning_rate": 5.678357608546236e-09, "logits/chosen": -2.65625, "logits/rejected": -2.765625, "logps/chosen": -664.0, "logps/rejected": -1160.0, "loss": 0.2153, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.875, "rewards/margins": 5.125, "rewards/rejected": -10.0, "step": 23650 }, { "epoch": 0.9392056844570589, "grad_norm": 32.12481737985574, "learning_rate": 5.6051754239709034e-09, "logits/chosen": -2.71875, "logits/rejected": -2.734375, "logps/chosen": -708.0, "logps/rejected": -1208.0, "loss": 0.2268, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.28125, "rewards/margins": 5.15625, "rewards/rejected": -10.4375, "step": 23660 }, { "epoch": 0.9396026437488836, "grad_norm": 31.57678557825919, "learning_rate": 5.532462539149852e-09, "logits/chosen": -2.78125, "logits/rejected": -2.953125, "logps/chosen": -676.0, "logps/rejected": -1176.0, "loss": 0.2135, "rewards/accuracies": 0.9375, "rewards/chosen": -5.0625, "rewards/margins": 5.0, "rewards/rejected": -10.0625, "step": 23670 }, { "epoch": 0.9399996030407082, "grad_norm": 27.11329008363804, "learning_rate": 5.46021909371025e-09, "logits/chosen": -2.8125, "logits/rejected": -3.015625, "logps/chosen": -656.0, "logps/rejected": -1208.0, "loss": 0.197, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.9375, "rewards/margins": 5.40625, "rewards/rejected": -10.375, "step": 23680 }, { "epoch": 0.9403965623325328, "grad_norm": 30.895356726510336, "learning_rate": 5.388445226377713e-09, "logits/chosen": -2.71875, "logits/rejected": -2.828125, "logps/chosen": -680.0, "logps/rejected": -1160.0, "loss": 0.173, "rewards/accuracies": 0.96875, "rewards/chosen": -5.15625, "rewards/margins": 4.65625, "rewards/rejected": -9.8125, "step": 23690 }, { "epoch": 0.9407935216243575, "grad_norm": 41.452357735831875, "learning_rate": 5.317141074976134e-09, "logits/chosen": -2.65625, "logits/rejected": -2.78125, "logps/chosen": -676.0, "logps/rejected": -1168.0, "loss": 0.218, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.09375, "rewards/margins": 5.0, "rewards/rejected": -10.125, "step": 23700 }, { "epoch": 0.9411904809161821, "grad_norm": 26.929095601315105, "learning_rate": 5.246306776427539e-09, "logits/chosen": -2.765625, "logits/rejected": -2.859375, "logps/chosen": -656.0, "logps/rejected": -1176.0, "loss": 0.2117, "rewards/accuracies": 0.96875, "rewards/chosen": -4.90625, "rewards/margins": 5.125, "rewards/rejected": -10.0625, "step": 23710 }, { "epoch": 0.9415874402080067, "grad_norm": 28.66164947093596, "learning_rate": 5.175942466751654e-09, "logits/chosen": -2.671875, "logits/rejected": -2.796875, "logps/chosen": -704.0, "logps/rejected": -1208.0, "loss": 0.1959, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.15625, "rewards/margins": 5.0625, "rewards/rejected": -10.1875, "step": 23720 }, { "epoch": 0.9419843994998313, "grad_norm": 31.212335333598492, "learning_rate": 5.106048281065728e-09, "logits/chosen": -2.625, "logits/rejected": -3.046875, "logps/chosen": -656.0, "logps/rejected": -1168.0, "loss": 0.2055, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.71875, "rewards/margins": 5.375, "rewards/rejected": -10.125, "step": 23730 }, { "epoch": 0.942381358791656, "grad_norm": 44.87706002419234, "learning_rate": 5.036624353584179e-09, "logits/chosen": -2.640625, "logits/rejected": -3.046875, "logps/chosen": -664.0, "logps/rejected": -1208.0, "loss": 0.193, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -4.8125, "rewards/margins": 5.625, "rewards/rejected": -10.4375, "step": 23740 }, { "epoch": 0.9427783180834806, "grad_norm": 24.978398782930913, "learning_rate": 4.9676708176185636e-09, "logits/chosen": -2.75, "logits/rejected": -3.203125, "logps/chosen": -660.0, "logps/rejected": -1176.0, "loss": 0.2117, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.8125, "rewards/margins": 5.34375, "rewards/rejected": -10.125, "step": 23750 }, { "epoch": 0.9431752773753052, "grad_norm": 24.337323980112604, "learning_rate": 4.8991878055770205e-09, "logits/chosen": -2.734375, "logits/rejected": -2.78125, "logps/chosen": -692.0, "logps/rejected": -1256.0, "loss": 0.2105, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.125, "rewards/margins": 5.625, "rewards/rejected": -10.75, "step": 23760 }, { "epoch": 0.9435722366671297, "grad_norm": 33.05573128132507, "learning_rate": 4.83117544896422e-09, "logits/chosen": -2.671875, "logits/rejected": -2.796875, "logps/chosen": -696.0, "logps/rejected": -1192.0, "loss": 0.2409, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.1875, "rewards/margins": 5.03125, "rewards/rejected": -10.1875, "step": 23770 }, { "epoch": 0.9439691959589545, "grad_norm": 26.201548861359978, "learning_rate": 4.763633878381051e-09, "logits/chosen": -2.78125, "logits/rejected": -3.046875, "logps/chosen": -652.0, "logps/rejected": -1192.0, "loss": 0.1932, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.8125, "rewards/margins": 5.5, "rewards/rejected": -10.3125, "step": 23780 }, { "epoch": 0.944366155250779, "grad_norm": 20.70736286004038, "learning_rate": 4.696563223524408e-09, "logits/chosen": -2.796875, "logits/rejected": -2.9375, "logps/chosen": -684.0, "logps/rejected": -1192.0, "loss": 0.1849, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.21875, "rewards/margins": 4.96875, "rewards/rejected": -10.1875, "step": 23790 }, { "epoch": 0.9447631145426036, "grad_norm": 21.046402332177298, "learning_rate": 4.6299636131868495e-09, "logits/chosen": -2.609375, "logits/rejected": -2.84375, "logps/chosen": -660.0, "logps/rejected": -1184.0, "loss": 0.2115, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.90625, "rewards/margins": 5.28125, "rewards/rejected": -10.1875, "step": 23800 }, { "epoch": 0.9451600738344282, "grad_norm": 34.767082118707656, "learning_rate": 4.563835175256464e-09, "logits/chosen": -2.734375, "logits/rejected": -3.09375, "logps/chosen": -700.0, "logps/rejected": -1192.0, "loss": 0.2001, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.125, "rewards/margins": 5.0625, "rewards/rejected": -10.1875, "step": 23810 }, { "epoch": 0.9455570331262529, "grad_norm": 34.84789408658533, "learning_rate": 4.49817803671651e-09, "logits/chosen": -2.75, "logits/rejected": -2.890625, "logps/chosen": -684.0, "logps/rejected": -1224.0, "loss": 0.2081, "rewards/accuracies": 0.96875, "rewards/chosen": -4.96875, "rewards/margins": 5.5625, "rewards/rejected": -10.5625, "step": 23820 }, { "epoch": 0.9459539924180775, "grad_norm": 25.34742877950978, "learning_rate": 4.432992323645302e-09, "logits/chosen": -2.75, "logits/rejected": -3.046875, "logps/chosen": -708.0, "logps/rejected": -1232.0, "loss": 0.2208, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.28125, "rewards/margins": 5.375, "rewards/rejected": -10.625, "step": 23830 }, { "epoch": 0.9463509517099021, "grad_norm": 22.54280123198656, "learning_rate": 4.368278161215821e-09, "logits/chosen": -2.765625, "logits/rejected": -2.984375, "logps/chosen": -656.0, "logps/rejected": -1200.0, "loss": 0.1864, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.75, "rewards/margins": 5.40625, "rewards/rejected": -10.125, "step": 23840 }, { "epoch": 0.9467479110017267, "grad_norm": 31.176244386208136, "learning_rate": 4.304035673695666e-09, "logits/chosen": -2.6875, "logits/rejected": -2.796875, "logps/chosen": -676.0, "logps/rejected": -1200.0, "loss": 0.198, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.09375, "rewards/margins": 5.09375, "rewards/rejected": -10.1875, "step": 23850 }, { "epoch": 0.9471448702935514, "grad_norm": 30.831500737142, "learning_rate": 4.240264984446573e-09, "logits/chosen": -2.734375, "logits/rejected": -2.796875, "logps/chosen": -664.0, "logps/rejected": -1208.0, "loss": 0.2041, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.96875, "rewards/margins": 5.0625, "rewards/rejected": -10.0625, "step": 23860 }, { "epoch": 0.947541829585376, "grad_norm": 24.697161368836547, "learning_rate": 4.176966215924338e-09, "logits/chosen": -2.578125, "logits/rejected": -2.75, "logps/chosen": -676.0, "logps/rejected": -1224.0, "loss": 0.1881, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.90625, "rewards/margins": 5.5625, "rewards/rejected": -10.5, "step": 23870 }, { "epoch": 0.9479387888772006, "grad_norm": 33.93710183982928, "learning_rate": 4.114139489678647e-09, "logits/chosen": -2.796875, "logits/rejected": -3.03125, "logps/chosen": -692.0, "logps/rejected": -1216.0, "loss": 0.2118, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.09375, "rewards/margins": 5.3125, "rewards/rejected": -10.375, "step": 23880 }, { "epoch": 0.9483357481690252, "grad_norm": 15.02587053375472, "learning_rate": 4.051784926352636e-09, "logits/chosen": -2.625, "logits/rejected": -2.984375, "logps/chosen": -668.0, "logps/rejected": -1216.0, "loss": 0.2091, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.8125, "rewards/margins": 5.5625, "rewards/rejected": -10.375, "step": 23890 }, { "epoch": 0.9487327074608499, "grad_norm": 18.19587455714511, "learning_rate": 3.989902645682802e-09, "logits/chosen": -2.75, "logits/rejected": -2.96875, "logps/chosen": -664.0, "logps/rejected": -1200.0, "loss": 0.1673, "rewards/accuracies": 0.96875, "rewards/chosen": -4.8125, "rewards/margins": 5.40625, "rewards/rejected": -10.25, "step": 23900 }, { "epoch": 0.9491296667526745, "grad_norm": 25.42377925161629, "learning_rate": 3.928492766498759e-09, "logits/chosen": -2.578125, "logits/rejected": -2.796875, "logps/chosen": -684.0, "logps/rejected": -1160.0, "loss": 0.2247, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.90625, "rewards/margins": 4.96875, "rewards/rejected": -9.875, "step": 23910 }, { "epoch": 0.9495266260444991, "grad_norm": 51.237003376752554, "learning_rate": 3.867555406722955e-09, "logits/chosen": -2.671875, "logits/rejected": -2.984375, "logps/chosen": -692.0, "logps/rejected": -1192.0, "loss": 0.2237, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.1875, "rewards/margins": 5.1875, "rewards/rejected": -10.375, "step": 23920 }, { "epoch": 0.9499235853363237, "grad_norm": 32.40145355553126, "learning_rate": 3.807090683370512e-09, "logits/chosen": -2.609375, "logits/rejected": -2.71875, "logps/chosen": -668.0, "logps/rejected": -1200.0, "loss": 0.2072, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.84375, "rewards/margins": 5.375, "rewards/rejected": -10.1875, "step": 23930 }, { "epoch": 0.9503205446281484, "grad_norm": 23.249444778760388, "learning_rate": 3.747098712548968e-09, "logits/chosen": -2.75, "logits/rejected": -2.984375, "logps/chosen": -652.0, "logps/rejected": -1176.0, "loss": 0.1817, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.90625, "rewards/margins": 5.1875, "rewards/rejected": -10.0625, "step": 23940 }, { "epoch": 0.950717503919973, "grad_norm": 34.87392693114453, "learning_rate": 3.6875796094580634e-09, "logits/chosen": -2.75, "logits/rejected": -3.171875, "logps/chosen": -660.0, "logps/rejected": -1192.0, "loss": 0.2324, "rewards/accuracies": 0.96875, "rewards/chosen": -4.71875, "rewards/margins": 5.5625, "rewards/rejected": -10.3125, "step": 23950 }, { "epoch": 0.9511144632117976, "grad_norm": 25.565774592262258, "learning_rate": 3.6285334883895126e-09, "logits/chosen": -2.8125, "logits/rejected": -3.0625, "logps/chosen": -672.0, "logps/rejected": -1184.0, "loss": 0.2168, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.09375, "rewards/margins": 5.1875, "rewards/rejected": -10.3125, "step": 23960 }, { "epoch": 0.9515114225036223, "grad_norm": 30.11955668689475, "learning_rate": 3.5699604627266734e-09, "logits/chosen": -2.796875, "logits/rejected": -3.03125, "logps/chosen": -636.0, "logps/rejected": -1160.0, "loss": 0.1912, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.6875, "rewards/margins": 5.1875, "rewards/rejected": -9.875, "step": 23970 }, { "epoch": 0.9519083817954469, "grad_norm": 23.303075309095274, "learning_rate": 3.5118606449446585e-09, "logits/chosen": -2.78125, "logits/rejected": -3.15625, "logps/chosen": -664.0, "logps/rejected": -1160.0, "loss": 0.1974, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.0, "rewards/margins": 5.09375, "rewards/rejected": -10.125, "step": 23980 }, { "epoch": 0.9523053410872715, "grad_norm": 31.407725965624692, "learning_rate": 3.4542341466097246e-09, "logits/chosen": -2.703125, "logits/rejected": -3.21875, "logps/chosen": -664.0, "logps/rejected": -1120.0, "loss": 0.2249, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.0, "rewards/margins": 4.6875, "rewards/rejected": -9.6875, "step": 23990 }, { "epoch": 0.9527023003790961, "grad_norm": 34.67153025583115, "learning_rate": 3.397081078379327e-09, "logits/chosen": -2.734375, "logits/rejected": -2.90625, "logps/chosen": -680.0, "logps/rejected": -1144.0, "loss": 0.2016, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.96875, "rewards/margins": 4.8125, "rewards/rejected": -9.8125, "step": 24000 }, { "epoch": 0.9527023003790961, "eval_logits/chosen": -2.703125, "eval_logits/rejected": -2.9375, "eval_logps/chosen": -712.0, "eval_logps/rejected": -1120.0, "eval_loss": 0.25219956040382385, "eval_rewards/accuracies": 0.894565224647522, "eval_rewards/chosen": -5.28125, "eval_rewards/margins": 4.28125, "eval_rewards/rejected": -9.5625, "eval_runtime": 5407.5902, "eval_samples_per_second": 32.665, "eval_steps_per_second": 0.51, "step": 24000 }, { "epoch": 0.9530992596709208, "grad_norm": 41.04314699406191, "learning_rate": 3.3404015500017592e-09, "logits/chosen": -2.640625, "logits/rejected": -2.9375, "logps/chosen": -636.0, "logps/rejected": -1184.0, "loss": 0.2326, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.625, "rewards/margins": 5.5, "rewards/rejected": -10.125, "step": 24010 }, { "epoch": 0.9534962189627454, "grad_norm": 17.774890053442764, "learning_rate": 3.284195670315987e-09, "logits/chosen": -2.65625, "logits/rejected": -2.640625, "logps/chosen": -704.0, "logps/rejected": -1256.0, "loss": 0.1884, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -5.21875, "rewards/margins": 5.46875, "rewards/rejected": -10.6875, "step": 24020 }, { "epoch": 0.95389317825457, "grad_norm": 21.211583488325495, "learning_rate": 3.2284635472515643e-09, "logits/chosen": -2.625, "logits/rejected": -2.890625, "logps/chosen": -672.0, "logps/rejected": -1168.0, "loss": 0.1949, "rewards/accuracies": 0.96875, "rewards/chosen": -4.96875, "rewards/margins": 4.9375, "rewards/rejected": -9.875, "step": 24030 }, { "epoch": 0.9542901375463946, "grad_norm": 25.02268063331469, "learning_rate": 3.17320528782819e-09, "logits/chosen": -2.78125, "logits/rejected": -3.203125, "logps/chosen": -664.0, "logps/rejected": -1168.0, "loss": 0.1698, "rewards/accuracies": 0.96875, "rewards/chosen": -5.03125, "rewards/margins": 5.25, "rewards/rejected": -10.3125, "step": 24040 }, { "epoch": 0.9546870968382193, "grad_norm": 41.313684291430754, "learning_rate": 3.1184209981556796e-09, "logits/chosen": -2.6875, "logits/rejected": -2.90625, "logps/chosen": -680.0, "logps/rejected": -1184.0, "loss": 0.2175, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.96875, "rewards/margins": 5.09375, "rewards/rejected": -10.0625, "step": 24050 }, { "epoch": 0.9550840561300439, "grad_norm": 25.913666853988115, "learning_rate": 3.0641107834337155e-09, "logits/chosen": -2.71875, "logits/rejected": -2.921875, "logps/chosen": -672.0, "logps/rejected": -1184.0, "loss": 0.2505, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.9375, "rewards/margins": 5.1875, "rewards/rejected": -10.125, "step": 24060 }, { "epoch": 0.9554810154218685, "grad_norm": 16.919538154734006, "learning_rate": 3.0102747479515965e-09, "logits/chosen": -2.734375, "logits/rejected": -2.875, "logps/chosen": -680.0, "logps/rejected": -1144.0, "loss": 0.2359, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.0625, "rewards/margins": 4.84375, "rewards/rejected": -9.9375, "step": 24070 }, { "epoch": 0.9558779747136931, "grad_norm": 31.819923307545572, "learning_rate": 2.9569129950880733e-09, "logits/chosen": -2.734375, "logits/rejected": -2.921875, "logps/chosen": -672.0, "logps/rejected": -1192.0, "loss": 0.1929, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.96875, "rewards/margins": 5.15625, "rewards/rejected": -10.125, "step": 24080 }, { "epoch": 0.9562749340055178, "grad_norm": 20.821373387758385, "learning_rate": 2.904025627311235e-09, "logits/chosen": -2.671875, "logits/rejected": -2.96875, "logps/chosen": -688.0, "logps/rejected": -1200.0, "loss": 0.197, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.125, "rewards/margins": 5.3125, "rewards/rejected": -10.4375, "step": 24090 }, { "epoch": 0.9566718932973424, "grad_norm": 38.53776489693109, "learning_rate": 2.851612746178178e-09, "logits/chosen": -2.75, "logits/rejected": -3.078125, "logps/chosen": -640.0, "logps/rejected": -1168.0, "loss": 0.2179, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.6875, "rewards/margins": 5.21875, "rewards/rejected": -9.9375, "step": 24100 }, { "epoch": 0.957068852589167, "grad_norm": 27.993855862031403, "learning_rate": 2.7996744523348102e-09, "logits/chosen": -2.59375, "logits/rejected": -2.96875, "logps/chosen": -660.0, "logps/rejected": -1184.0, "loss": 0.2062, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.8125, "rewards/margins": 5.5, "rewards/rejected": -10.3125, "step": 24110 }, { "epoch": 0.9574658118809916, "grad_norm": 34.48492211142527, "learning_rate": 2.7482108455158248e-09, "logits/chosen": -2.65625, "logits/rejected": -2.859375, "logps/chosen": -696.0, "logps/rejected": -1216.0, "loss": 0.2174, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -5.125, "rewards/margins": 5.25, "rewards/rejected": -10.375, "step": 24120 }, { "epoch": 0.9578627711728163, "grad_norm": 18.623121162044267, "learning_rate": 2.6972220245442824e-09, "logits/chosen": -2.6875, "logits/rejected": -2.859375, "logps/chosen": -656.0, "logps/rejected": -1200.0, "loss": 0.1472, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.84375, "rewards/margins": 5.53125, "rewards/rejected": -10.375, "step": 24130 }, { "epoch": 0.9582597304646409, "grad_norm": 29.558297547750126, "learning_rate": 2.6467080873316392e-09, "logits/chosen": -2.71875, "logits/rejected": -2.65625, "logps/chosen": -700.0, "logps/rejected": -1216.0, "loss": 0.2182, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.21875, "rewards/margins": 4.96875, "rewards/rejected": -10.1875, "step": 24140 }, { "epoch": 0.9586566897564655, "grad_norm": 12.521861548533957, "learning_rate": 2.596669130877388e-09, "logits/chosen": -2.609375, "logits/rejected": -2.953125, "logps/chosen": -640.0, "logps/rejected": -1184.0, "loss": 0.1948, "rewards/accuracies": 0.96875, "rewards/chosen": -4.71875, "rewards/margins": 5.375, "rewards/rejected": -10.125, "step": 24150 }, { "epoch": 0.95905364904829, "grad_norm": 37.955029438815686, "learning_rate": 2.547105251268944e-09, "logits/chosen": -2.609375, "logits/rejected": -2.953125, "logps/chosen": -676.0, "logps/rejected": -1208.0, "loss": 0.2369, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.96875, "rewards/margins": 5.40625, "rewards/rejected": -10.375, "step": 24160 }, { "epoch": 0.9594506083401148, "grad_norm": 23.04136391522569, "learning_rate": 2.4980165436814806e-09, "logits/chosen": -2.640625, "logits/rejected": -2.984375, "logps/chosen": -684.0, "logps/rejected": -1168.0, "loss": 0.2281, "rewards/accuracies": 0.9375, "rewards/chosen": -4.9375, "rewards/margins": 5.03125, "rewards/rejected": -9.9375, "step": 24170 }, { "epoch": 0.9598475676319393, "grad_norm": 21.613820919672396, "learning_rate": 2.4494031023777063e-09, "logits/chosen": -2.765625, "logits/rejected": -2.953125, "logps/chosen": -704.0, "logps/rejected": -1216.0, "loss": 0.1719, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.0625, "rewards/margins": 5.4375, "rewards/rejected": -10.5, "step": 24180 }, { "epoch": 0.9602445269237639, "grad_norm": 25.409026882932498, "learning_rate": 2.401265020707699e-09, "logits/chosen": -2.78125, "logits/rejected": -3.046875, "logps/chosen": -676.0, "logps/rejected": -1208.0, "loss": 0.2076, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.125, "rewards/margins": 5.46875, "rewards/rejected": -10.5625, "step": 24190 }, { "epoch": 0.9606414862155885, "grad_norm": 35.486481867550204, "learning_rate": 2.3536023911087387e-09, "logits/chosen": -2.78125, "logits/rejected": -3.0625, "logps/chosen": -664.0, "logps/rejected": -1168.0, "loss": 0.2229, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.0625, "rewards/margins": 5.0, "rewards/rejected": -10.0625, "step": 24200 }, { "epoch": 0.9610384455074132, "grad_norm": 23.755019743793895, "learning_rate": 2.306415305105058e-09, "logits/chosen": -2.703125, "logits/rejected": -2.890625, "logps/chosen": -652.0, "logps/rejected": -1208.0, "loss": 0.2013, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.8125, "rewards/margins": 5.5625, "rewards/rejected": -10.375, "step": 24210 }, { "epoch": 0.9614354047992378, "grad_norm": 29.20990537493375, "learning_rate": 2.259703853307815e-09, "logits/chosen": -2.515625, "logits/rejected": -2.96875, "logps/chosen": -656.0, "logps/rejected": -1208.0, "loss": 0.1828, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.71875, "rewards/margins": 5.5, "rewards/rejected": -10.1875, "step": 24220 }, { "epoch": 0.9618323640910624, "grad_norm": 28.798182444948875, "learning_rate": 2.2134681254148137e-09, "logits/chosen": -2.703125, "logits/rejected": -2.859375, "logps/chosen": -700.0, "logps/rejected": -1216.0, "loss": 0.1837, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.21875, "rewards/margins": 5.21875, "rewards/rejected": -10.4375, "step": 24230 }, { "epoch": 0.9622293233828871, "grad_norm": 13.90398231764623, "learning_rate": 2.1677082102102563e-09, "logits/chosen": -2.609375, "logits/rejected": -2.734375, "logps/chosen": -644.0, "logps/rejected": -1200.0, "loss": 0.2083, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.75, "rewards/margins": 5.5625, "rewards/rejected": -10.3125, "step": 24240 }, { "epoch": 0.9626262826747117, "grad_norm": 21.99590845676493, "learning_rate": 2.1224241955647426e-09, "logits/chosen": -2.71875, "logits/rejected": -2.828125, "logps/chosen": -656.0, "logps/rejected": -1216.0, "loss": 0.2086, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.90625, "rewards/margins": 5.34375, "rewards/rejected": -10.25, "step": 24250 }, { "epoch": 0.9630232419665363, "grad_norm": 34.74111010216891, "learning_rate": 2.077616168435076e-09, "logits/chosen": -2.625, "logits/rejected": -3.234375, "logps/chosen": -668.0, "logps/rejected": -1192.0, "loss": 0.1998, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.875, "rewards/margins": 5.53125, "rewards/rejected": -10.375, "step": 24260 }, { "epoch": 0.9634202012583609, "grad_norm": 30.322654367919935, "learning_rate": 2.0332842148639285e-09, "logits/chosen": -2.609375, "logits/rejected": -2.90625, "logps/chosen": -692.0, "logps/rejected": -1192.0, "loss": 0.219, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.15625, "rewards/margins": 5.0, "rewards/rejected": -10.125, "step": 24270 }, { "epoch": 0.9638171605501856, "grad_norm": 23.04628428285642, "learning_rate": 1.989428419979844e-09, "logits/chosen": -2.578125, "logits/rejected": -2.703125, "logps/chosen": -656.0, "logps/rejected": -1208.0, "loss": 0.2058, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.875, "rewards/margins": 5.5625, "rewards/rejected": -10.4375, "step": 24280 }, { "epoch": 0.9642141198420102, "grad_norm": 25.315684032081816, "learning_rate": 1.9460488679970132e-09, "logits/chosen": -2.609375, "logits/rejected": -2.828125, "logps/chosen": -692.0, "logps/rejected": -1184.0, "loss": 0.2178, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.1875, "rewards/margins": 4.9375, "rewards/rejected": -10.125, "step": 24290 }, { "epoch": 0.9646110791338348, "grad_norm": 37.64494978771232, "learning_rate": 1.903145642215137e-09, "logits/chosen": -2.75, "logits/rejected": -2.984375, "logps/chosen": -684.0, "logps/rejected": -1192.0, "loss": 0.1981, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -5.0, "rewards/margins": 5.25, "rewards/rejected": -10.25, "step": 24300 }, { "epoch": 0.9650080384256594, "grad_norm": 26.023569832531656, "learning_rate": 1.8607188250192584e-09, "logits/chosen": -2.609375, "logits/rejected": -2.8125, "logps/chosen": -668.0, "logps/rejected": -1176.0, "loss": 0.1928, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.0, "rewards/margins": 5.15625, "rewards/rejected": -10.125, "step": 24310 }, { "epoch": 0.9654049977174841, "grad_norm": 23.27411676288734, "learning_rate": 1.8187684978795414e-09, "logits/chosen": -2.75, "logits/rejected": -2.8125, "logps/chosen": -684.0, "logps/rejected": -1192.0, "loss": 0.2018, "rewards/accuracies": 0.96875, "rewards/chosen": -5.21875, "rewards/margins": 5.09375, "rewards/rejected": -10.3125, "step": 24320 }, { "epoch": 0.9658019570093087, "grad_norm": 28.839539402255014, "learning_rate": 1.7772947413512152e-09, "logits/chosen": -2.75, "logits/rejected": -3.0, "logps/chosen": -664.0, "logps/rejected": -1120.0, "loss": 0.1939, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.0, "rewards/margins": 4.6875, "rewards/rejected": -9.6875, "step": 24330 }, { "epoch": 0.9661989163011333, "grad_norm": 24.44169892148134, "learning_rate": 1.7362976350743797e-09, "logits/chosen": -2.78125, "logits/rejected": -2.828125, "logps/chosen": -660.0, "logps/rejected": -1280.0, "loss": 0.2481, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.96875, "rewards/margins": 5.9375, "rewards/rejected": -10.9375, "step": 24340 }, { "epoch": 0.9665958755929579, "grad_norm": 27.09417093687331, "learning_rate": 1.695777257773784e-09, "logits/chosen": -2.78125, "logits/rejected": -3.0625, "logps/chosen": -684.0, "logps/rejected": -1200.0, "loss": 0.2085, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.15625, "rewards/margins": 5.15625, "rewards/rejected": -10.3125, "step": 24350 }, { "epoch": 0.9669928348847826, "grad_norm": 30.57836302126762, "learning_rate": 1.6557336872588257e-09, "logits/chosen": -2.765625, "logits/rejected": -3.046875, "logps/chosen": -656.0, "logps/rejected": -1160.0, "loss": 0.2006, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.90625, "rewards/margins": 5.0625, "rewards/rejected": -9.9375, "step": 24360 }, { "epoch": 0.9673897941766072, "grad_norm": 37.04412466612038, "learning_rate": 1.6161670004232463e-09, "logits/chosen": -2.5625, "logits/rejected": -2.796875, "logps/chosen": -692.0, "logps/rejected": -1208.0, "loss": 0.228, "rewards/accuracies": 0.96875, "rewards/chosen": -5.09375, "rewards/margins": 5.28125, "rewards/rejected": -10.375, "step": 24370 }, { "epoch": 0.9677867534684318, "grad_norm": 35.159442845978376, "learning_rate": 1.5770772732451033e-09, "logits/chosen": -2.65625, "logits/rejected": -2.9375, "logps/chosen": -700.0, "logps/rejected": -1184.0, "loss": 0.2114, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.1875, "rewards/margins": 4.90625, "rewards/rejected": -10.125, "step": 24380 }, { "epoch": 0.9681837127602564, "grad_norm": 35.38830300416721, "learning_rate": 1.5384645807864637e-09, "logits/chosen": -2.734375, "logits/rejected": -2.828125, "logps/chosen": -684.0, "logps/rejected": -1184.0, "loss": 0.2593, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.96875, "rewards/margins": 5.1875, "rewards/rejected": -10.125, "step": 24390 }, { "epoch": 0.9685806720520811, "grad_norm": 33.55705868218884, "learning_rate": 1.5003289971935174e-09, "logits/chosen": -2.84375, "logits/rejected": -2.9375, "logps/chosen": -656.0, "logps/rejected": -1192.0, "loss": 0.2374, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.90625, "rewards/margins": 5.34375, "rewards/rejected": -10.25, "step": 24400 }, { "epoch": 0.9689776313439057, "grad_norm": 37.42230545212364, "learning_rate": 1.462670595696186e-09, "logits/chosen": -2.875, "logits/rejected": -3.1875, "logps/chosen": -652.0, "logps/rejected": -1200.0, "loss": 0.1906, "rewards/accuracies": 0.96875, "rewards/chosen": -4.90625, "rewards/margins": 5.40625, "rewards/rejected": -10.3125, "step": 24410 }, { "epoch": 0.9693745906357303, "grad_norm": 27.13912442373595, "learning_rate": 1.4254894486081247e-09, "logits/chosen": -2.703125, "logits/rejected": -3.15625, "logps/chosen": -688.0, "logps/rejected": -1208.0, "loss": 0.1841, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.03125, "rewards/margins": 5.46875, "rewards/rejected": -10.5, "step": 24420 }, { "epoch": 0.9697715499275549, "grad_norm": 32.66622081901645, "learning_rate": 1.388785627326472e-09, "logits/chosen": -2.71875, "logits/rejected": -2.984375, "logps/chosen": -684.0, "logps/rejected": -1184.0, "loss": 0.1999, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.0, "rewards/margins": 5.0625, "rewards/rejected": -10.125, "step": 24430 }, { "epoch": 0.9701685092193796, "grad_norm": 25.34426696191669, "learning_rate": 1.3525592023318766e-09, "logits/chosen": -2.71875, "logits/rejected": -2.921875, "logps/chosen": -664.0, "logps/rejected": -1176.0, "loss": 0.2116, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.0, "rewards/margins": 5.15625, "rewards/rejected": -10.125, "step": 24440 }, { "epoch": 0.9705654685112042, "grad_norm": 24.703219392962534, "learning_rate": 1.316810243188221e-09, "logits/chosen": -2.75, "logits/rejected": -2.859375, "logps/chosen": -688.0, "logps/rejected": -1144.0, "loss": 0.2269, "rewards/accuracies": 0.9375, "rewards/chosen": -5.15625, "rewards/margins": 4.65625, "rewards/rejected": -9.8125, "step": 24450 }, { "epoch": 0.9709624278030288, "grad_norm": 26.151059494853065, "learning_rate": 1.2815388185425103e-09, "logits/chosen": -2.640625, "logits/rejected": -2.953125, "logps/chosen": -672.0, "logps/rejected": -1168.0, "loss": 0.2466, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.03125, "rewards/margins": 5.0625, "rewards/rejected": -10.0625, "step": 24460 }, { "epoch": 0.9713593870948535, "grad_norm": 23.553590063674566, "learning_rate": 1.2467449961247878e-09, "logits/chosen": -2.765625, "logits/rejected": -2.859375, "logps/chosen": -688.0, "logps/rejected": -1168.0, "loss": 0.2062, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.125, "rewards/margins": 4.96875, "rewards/rejected": -10.0625, "step": 24470 }, { "epoch": 0.9717563463866781, "grad_norm": 26.15906835392964, "learning_rate": 1.2124288427479423e-09, "logits/chosen": -2.609375, "logits/rejected": -2.734375, "logps/chosen": -680.0, "logps/rejected": -1216.0, "loss": 0.1985, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.90625, "rewards/margins": 5.40625, "rewards/rejected": -10.3125, "step": 24480 }, { "epoch": 0.9721533056785027, "grad_norm": 34.18434339238905, "learning_rate": 1.1785904243077072e-09, "logits/chosen": -2.75, "logits/rejected": -3.046875, "logps/chosen": -656.0, "logps/rejected": -1176.0, "loss": 0.1948, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.75, "rewards/margins": 5.375, "rewards/rejected": -10.125, "step": 24490 }, { "epoch": 0.9725502649703273, "grad_norm": 32.656075439646386, "learning_rate": 1.1452298057823551e-09, "logits/chosen": -2.734375, "logits/rejected": -2.921875, "logps/chosen": -660.0, "logps/rejected": -1200.0, "loss": 0.1947, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.0, "rewards/margins": 5.375, "rewards/rejected": -10.375, "step": 24500 }, { "epoch": 0.972947224262152, "grad_norm": 30.995260605864186, "learning_rate": 1.112347051232726e-09, "logits/chosen": -2.625, "logits/rejected": -2.734375, "logps/chosen": -676.0, "logps/rejected": -1192.0, "loss": 0.2293, "rewards/accuracies": 0.9375, "rewards/chosen": -4.96875, "rewards/margins": 5.25, "rewards/rejected": -10.1875, "step": 24510 }, { "epoch": 0.9733441835539766, "grad_norm": 46.48551677608805, "learning_rate": 1.0799422238019773e-09, "logits/chosen": -2.75, "logits/rejected": -2.65625, "logps/chosen": -732.0, "logps/rejected": -1280.0, "loss": 0.221, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.625, "rewards/margins": 5.25, "rewards/rejected": -10.875, "step": 24520 }, { "epoch": 0.9737411428458012, "grad_norm": 15.032529131377864, "learning_rate": 1.0480153857155283e-09, "logits/chosen": -2.734375, "logits/rejected": -2.796875, "logps/chosen": -680.0, "logps/rejected": -1192.0, "loss": 0.1985, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.9375, "rewards/margins": 5.21875, "rewards/rejected": -10.1875, "step": 24530 }, { "epoch": 0.9741381021376257, "grad_norm": 41.03291134955497, "learning_rate": 1.01656659828106e-09, "logits/chosen": -2.578125, "logits/rejected": -2.8125, "logps/chosen": -668.0, "logps/rejected": -1192.0, "loss": 0.2267, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.96875, "rewards/margins": 5.21875, "rewards/rejected": -10.1875, "step": 24540 }, { "epoch": 0.9745350614294505, "grad_norm": 26.178738468054885, "learning_rate": 9.855959218880716e-10, "logits/chosen": -2.6875, "logits/rejected": -3.09375, "logps/chosen": -680.0, "logps/rejected": -1176.0, "loss": 0.2043, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.90625, "rewards/margins": 5.25, "rewards/rejected": -10.125, "step": 24550 }, { "epoch": 0.974932020721275, "grad_norm": 34.37125194994976, "learning_rate": 9.55103416008185e-10, "logits/chosen": -2.734375, "logits/rejected": -3.0, "logps/chosen": -664.0, "logps/rejected": -1184.0, "loss": 0.2361, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.96875, "rewards/margins": 5.28125, "rewards/rejected": -10.25, "step": 24560 }, { "epoch": 0.9753289800130996, "grad_norm": 37.860070026061294, "learning_rate": 9.25089139194618e-10, "logits/chosen": -2.875, "logits/rejected": -3.28125, "logps/chosen": -696.0, "logps/rejected": -1152.0, "loss": 0.2, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.25, "rewards/margins": 4.8125, "rewards/rejected": -10.0625, "step": 24570 }, { "epoch": 0.9757259393049242, "grad_norm": 38.51176471396085, "learning_rate": 8.955531490824341e-10, "logits/chosen": -2.765625, "logits/rejected": -2.703125, "logps/chosen": -700.0, "logps/rejected": -1200.0, "loss": 0.2109, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.25, "rewards/margins": 5.09375, "rewards/rejected": -10.375, "step": 24580 }, { "epoch": 0.976122898596749, "grad_norm": 23.878491869855086, "learning_rate": 8.664955023881259e-10, "logits/chosen": -2.609375, "logits/rejected": -2.875, "logps/chosen": -664.0, "logps/rejected": -1224.0, "loss": 0.2269, "rewards/accuracies": 0.96875, "rewards/chosen": -5.0625, "rewards/margins": 5.4375, "rewards/rejected": -10.5, "step": 24590 }, { "epoch": 0.9765198578885735, "grad_norm": 23.352750887987188, "learning_rate": 8.379162549097541e-10, "logits/chosen": -2.640625, "logits/rejected": -2.984375, "logps/chosen": -700.0, "logps/rejected": -1192.0, "loss": 0.235, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.09375, "rewards/margins": 5.15625, "rewards/rejected": -10.25, "step": 24600 }, { "epoch": 0.9769168171803981, "grad_norm": 22.25614495110942, "learning_rate": 8.098154615266695e-10, "logits/chosen": -2.75, "logits/rejected": -2.90625, "logps/chosen": -688.0, "logps/rejected": -1176.0, "loss": 0.2172, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.09375, "rewards/margins": 4.875, "rewards/rejected": -10.0, "step": 24610 }, { "epoch": 0.9773137764722227, "grad_norm": 30.462248711865417, "learning_rate": 7.82193176199486e-10, "logits/chosen": -2.625, "logits/rejected": -3.0, "logps/chosen": -680.0, "logps/rejected": -1208.0, "loss": 0.2168, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.96875, "rewards/margins": 5.40625, "rewards/rejected": -10.375, "step": 24620 }, { "epoch": 0.9777107357640474, "grad_norm": 21.42940861711264, "learning_rate": 7.550494519699968e-10, "logits/chosen": -2.625, "logits/rejected": -3.140625, "logps/chosen": -644.0, "logps/rejected": -1176.0, "loss": 0.2082, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.71875, "rewards/margins": 5.5, "rewards/rejected": -10.1875, "step": 24630 }, { "epoch": 0.978107695055872, "grad_norm": 19.754162017237235, "learning_rate": 7.283843409609525e-10, "logits/chosen": -2.75, "logits/rejected": -2.84375, "logps/chosen": -680.0, "logps/rejected": -1200.0, "loss": 0.2252, "rewards/accuracies": 0.96875, "rewards/chosen": -5.09375, "rewards/margins": 5.15625, "rewards/rejected": -10.25, "step": 24640 }, { "epoch": 0.9785046543476966, "grad_norm": 24.728139405407916, "learning_rate": 7.021978943761164e-10, "logits/chosen": -2.609375, "logits/rejected": -3.1875, "logps/chosen": -676.0, "logps/rejected": -1248.0, "loss": 0.183, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -4.96875, "rewards/margins": 5.90625, "rewards/rejected": -10.875, "step": 24650 }, { "epoch": 0.9789016136395212, "grad_norm": 28.126449673923595, "learning_rate": 6.764901625000985e-10, "logits/chosen": -2.671875, "logits/rejected": -3.125, "logps/chosen": -656.0, "logps/rejected": -1152.0, "loss": 0.2113, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.875, "rewards/margins": 5.09375, "rewards/rejected": -10.0, "step": 24660 }, { "epoch": 0.9792985729313459, "grad_norm": 17.250478292800338, "learning_rate": 6.512611946981883e-10, "logits/chosen": -2.796875, "logits/rejected": -3.078125, "logps/chosen": -656.0, "logps/rejected": -1208.0, "loss": 0.2023, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.9375, "rewards/margins": 5.53125, "rewards/rejected": -10.5, "step": 24670 }, { "epoch": 0.9796955322231705, "grad_norm": 20.26202609275382, "learning_rate": 6.265110394164386e-10, "logits/chosen": -2.546875, "logits/rejected": -2.921875, "logps/chosen": -700.0, "logps/rejected": -1192.0, "loss": 0.2334, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.0625, "rewards/margins": 5.0, "rewards/rejected": -10.0625, "step": 24680 }, { "epoch": 0.9800924915149951, "grad_norm": 18.577962998442388, "learning_rate": 6.022397441813321e-10, "logits/chosen": -2.734375, "logits/rejected": -3.0625, "logps/chosen": -652.0, "logps/rejected": -1176.0, "loss": 0.2146, "rewards/accuracies": 0.96875, "rewards/chosen": -4.8125, "rewards/margins": 5.375, "rewards/rejected": -10.1875, "step": 24690 }, { "epoch": 0.9804894508068197, "grad_norm": 30.398321573820283, "learning_rate": 5.784473555999203e-10, "logits/chosen": -2.8125, "logits/rejected": -3.03125, "logps/chosen": -684.0, "logps/rejected": -1184.0, "loss": 0.2195, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.1875, "rewards/margins": 5.09375, "rewards/rejected": -10.25, "step": 24700 }, { "epoch": 0.9808864100986444, "grad_norm": 27.33424192076559, "learning_rate": 5.551339193595738e-10, "logits/chosen": -2.71875, "logits/rejected": -2.953125, "logps/chosen": -648.0, "logps/rejected": -1152.0, "loss": 0.242, "rewards/accuracies": 0.9375, "rewards/chosen": -4.9375, "rewards/margins": 5.0625, "rewards/rejected": -10.0, "step": 24710 }, { "epoch": 0.981283369390469, "grad_norm": 20.332098110022763, "learning_rate": 5.322994802279823e-10, "logits/chosen": -2.734375, "logits/rejected": -2.78125, "logps/chosen": -672.0, "logps/rejected": -1160.0, "loss": 0.2125, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.9375, "rewards/margins": 4.9375, "rewards/rejected": -9.875, "step": 24720 }, { "epoch": 0.9816803286822936, "grad_norm": 30.579706982247803, "learning_rate": 5.099440820530431e-10, "logits/chosen": -2.640625, "logits/rejected": -2.890625, "logps/chosen": -668.0, "logps/rejected": -1176.0, "loss": 0.2006, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.8125, "rewards/margins": 5.0625, "rewards/rejected": -9.875, "step": 24730 }, { "epoch": 0.9820772879741183, "grad_norm": 21.02307277147712, "learning_rate": 4.880677677627509e-10, "logits/chosen": -2.71875, "logits/rejected": -2.859375, "logps/chosen": -664.0, "logps/rejected": -1184.0, "loss": 0.1918, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -4.96875, "rewards/margins": 5.09375, "rewards/rejected": -10.0625, "step": 24740 }, { "epoch": 0.9824742472659429, "grad_norm": 37.37454482684648, "learning_rate": 4.666705793651416e-10, "logits/chosen": -2.65625, "logits/rejected": -2.890625, "logps/chosen": -704.0, "logps/rejected": -1160.0, "loss": 0.2502, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.1875, "rewards/margins": 4.71875, "rewards/rejected": -9.9375, "step": 24750 }, { "epoch": 0.9828712065577675, "grad_norm": 23.968146269871294, "learning_rate": 4.457525579482369e-10, "logits/chosen": -2.703125, "logits/rejected": -2.84375, "logps/chosen": -668.0, "logps/rejected": -1176.0, "loss": 0.2233, "rewards/accuracies": 0.96875, "rewards/chosen": -4.90625, "rewards/margins": 5.3125, "rewards/rejected": -10.1875, "step": 24760 }, { "epoch": 0.9832681658495921, "grad_norm": 16.68749321171083, "learning_rate": 4.253137436799059e-10, "logits/chosen": -2.75, "logits/rejected": -3.25, "logps/chosen": -680.0, "logps/rejected": -1168.0, "loss": 0.2052, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.0625, "rewards/margins": 5.09375, "rewards/rejected": -10.125, "step": 24770 }, { "epoch": 0.9836651251414168, "grad_norm": 42.937520227872525, "learning_rate": 4.053541758078649e-10, "logits/chosen": -2.59375, "logits/rejected": -2.71875, "logps/chosen": -672.0, "logps/rejected": -1184.0, "loss": 0.2071, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.9375, "rewards/margins": 5.0, "rewards/rejected": -9.9375, "step": 24780 }, { "epoch": 0.9840620844332414, "grad_norm": 24.90628312233211, "learning_rate": 3.858738926594829e-10, "logits/chosen": -2.734375, "logits/rejected": -2.921875, "logps/chosen": -720.0, "logps/rejected": -1176.0, "loss": 0.1976, "rewards/accuracies": 0.9375, "rewards/chosen": -5.46875, "rewards/margins": 4.40625, "rewards/rejected": -9.875, "step": 24790 }, { "epoch": 0.984459043725066, "grad_norm": 31.586806774890277, "learning_rate": 3.6687293164180955e-10, "logits/chosen": -2.75, "logits/rejected": -3.03125, "logps/chosen": -676.0, "logps/rejected": -1176.0, "loss": 0.1938, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.1875, "rewards/margins": 5.03125, "rewards/rejected": -10.1875, "step": 24800 }, { "epoch": 0.9848560030168906, "grad_norm": 27.6874459318448, "learning_rate": 3.483513292415474e-10, "logits/chosen": -2.625, "logits/rejected": -2.859375, "logps/chosen": -664.0, "logps/rejected": -1192.0, "loss": 0.2065, "rewards/accuracies": 0.96875, "rewards/chosen": -4.84375, "rewards/margins": 5.4375, "rewards/rejected": -10.25, "step": 24810 }, { "epoch": 0.9852529623087153, "grad_norm": 26.857968492833844, "learning_rate": 3.303091210247744e-10, "logits/chosen": -2.625, "logits/rejected": -3.015625, "logps/chosen": -704.0, "logps/rejected": -1216.0, "loss": 0.2405, "rewards/accuracies": 0.9375, "rewards/chosen": -5.125, "rewards/margins": 5.3125, "rewards/rejected": -10.4375, "step": 24820 }, { "epoch": 0.9856499216005399, "grad_norm": 30.698576101318512, "learning_rate": 3.127463416371379e-10, "logits/chosen": -2.6875, "logits/rejected": -3.078125, "logps/chosen": -668.0, "logps/rejected": -1176.0, "loss": 0.1948, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.78125, "rewards/margins": 5.3125, "rewards/rejected": -10.0625, "step": 24830 }, { "epoch": 0.9860468808923645, "grad_norm": 20.564275310940484, "learning_rate": 2.9566302480360517e-10, "logits/chosen": -2.65625, "logits/rejected": -2.859375, "logps/chosen": -656.0, "logps/rejected": -1168.0, "loss": 0.2033, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.75, "rewards/margins": 5.21875, "rewards/rejected": -10.0, "step": 24840 }, { "epoch": 0.9864438401841891, "grad_norm": 26.609230497008276, "learning_rate": 2.790592033284356e-10, "logits/chosen": -2.78125, "logits/rejected": -2.984375, "logps/chosen": -732.0, "logps/rejected": -1216.0, "loss": 0.2207, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.5625, "rewards/margins": 4.84375, "rewards/rejected": -10.375, "step": 24850 }, { "epoch": 0.9868407994760138, "grad_norm": 26.954192436690693, "learning_rate": 2.629349090951527e-10, "logits/chosen": -2.640625, "logits/rejected": -2.984375, "logps/chosen": -676.0, "logps/rejected": -1184.0, "loss": 0.1914, "rewards/accuracies": 0.96875, "rewards/chosen": -5.125, "rewards/margins": 5.15625, "rewards/rejected": -10.25, "step": 24860 }, { "epoch": 0.9872377587678384, "grad_norm": 22.98432516650995, "learning_rate": 2.4729017306648893e-10, "logits/chosen": -2.65625, "logits/rejected": -3.046875, "logps/chosen": -704.0, "logps/rejected": -1168.0, "loss": 0.2244, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.1875, "rewards/margins": 5.0, "rewards/rejected": -10.1875, "step": 24870 }, { "epoch": 0.987634718059663, "grad_norm": 17.532257138520492, "learning_rate": 2.3212502528427436e-10, "logits/chosen": -2.765625, "logits/rejected": -3.09375, "logps/chosen": -676.0, "logps/rejected": -1184.0, "loss": 0.1861, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -5.0, "rewards/margins": 5.1875, "rewards/rejected": -10.1875, "step": 24880 }, { "epoch": 0.9880316773514876, "grad_norm": 27.358652431234947, "learning_rate": 2.174394948694369e-10, "logits/chosen": -2.625, "logits/rejected": -2.8125, "logps/chosen": -688.0, "logps/rejected": -1200.0, "loss": 0.2184, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -5.09375, "rewards/margins": 5.125, "rewards/rejected": -10.25, "step": 24890 }, { "epoch": 0.9884286366433123, "grad_norm": 21.964192802691286, "learning_rate": 2.0323361002189121e-10, "logits/chosen": -2.84375, "logits/rejected": -3.09375, "logps/chosen": -696.0, "logps/rejected": -1168.0, "loss": 0.2101, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.28125, "rewards/margins": 4.75, "rewards/rejected": -10.0, "step": 24900 }, { "epoch": 0.9888255959351369, "grad_norm": 22.764489236851716, "learning_rate": 1.8950739802051086e-10, "logits/chosen": -2.8125, "logits/rejected": -2.9375, "logps/chosen": -684.0, "logps/rejected": -1152.0, "loss": 0.1919, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.15625, "rewards/margins": 4.65625, "rewards/rejected": -9.8125, "step": 24910 }, { "epoch": 0.9892225552269615, "grad_norm": 23.47177910693439, "learning_rate": 1.7626088522312844e-10, "logits/chosen": -2.5625, "logits/rejected": -2.828125, "logps/chosen": -700.0, "logps/rejected": -1184.0, "loss": 0.1751, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.1875, "rewards/margins": 5.0, "rewards/rejected": -10.1875, "step": 24920 }, { "epoch": 0.989619514518786, "grad_norm": 24.560077100113627, "learning_rate": 1.634940970663412e-10, "logits/chosen": -2.6875, "logits/rejected": -2.953125, "logps/chosen": -664.0, "logps/rejected": -1144.0, "loss": 0.2363, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.90625, "rewards/margins": 4.90625, "rewards/rejected": -9.8125, "step": 24930 }, { "epoch": 0.9900164738106108, "grad_norm": 30.140321881575794, "learning_rate": 1.5120705806562218e-10, "logits/chosen": -2.65625, "logits/rejected": -2.859375, "logps/chosen": -648.0, "logps/rejected": -1192.0, "loss": 0.2017, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -4.78125, "rewards/margins": 5.34375, "rewards/rejected": -10.125, "step": 24940 }, { "epoch": 0.9904134331024353, "grad_norm": 22.631407201847043, "learning_rate": 1.39399791815209e-10, "logits/chosen": -2.65625, "logits/rejected": -2.96875, "logps/chosen": -668.0, "logps/rejected": -1184.0, "loss": 0.2217, "rewards/accuracies": 0.96875, "rewards/chosen": -4.875, "rewards/margins": 5.40625, "rewards/rejected": -10.3125, "step": 24950 }, { "epoch": 0.9908103923942599, "grad_norm": 31.217664759978142, "learning_rate": 1.280723209880208e-10, "logits/chosen": -2.671875, "logits/rejected": -2.71875, "logps/chosen": -692.0, "logps/rejected": -1192.0, "loss": 0.196, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.0625, "rewards/margins": 5.125, "rewards/rejected": -10.1875, "step": 24960 }, { "epoch": 0.9912073516860845, "grad_norm": 23.391192228456436, "learning_rate": 1.1722466733563031e-10, "logits/chosen": -2.71875, "logits/rejected": -2.828125, "logps/chosen": -652.0, "logps/rejected": -1168.0, "loss": 0.2187, "rewards/accuracies": 0.96875, "rewards/chosen": -4.90625, "rewards/margins": 5.09375, "rewards/rejected": -10.0, "step": 24970 }, { "epoch": 0.9916043109779092, "grad_norm": 16.322424702840713, "learning_rate": 1.0685685168831948e-10, "logits/chosen": -2.765625, "logits/rejected": -2.90625, "logps/chosen": -644.0, "logps/rejected": -1192.0, "loss": 0.2004, "rewards/accuracies": 0.96875, "rewards/chosen": -4.75, "rewards/margins": 5.46875, "rewards/rejected": -10.25, "step": 24980 }, { "epoch": 0.9920012702697338, "grad_norm": 18.95934524513289, "learning_rate": 9.696889395488517e-11, "logits/chosen": -2.734375, "logits/rejected": -2.90625, "logps/chosen": -660.0, "logps/rejected": -1208.0, "loss": 0.2017, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.90625, "rewards/margins": 5.5, "rewards/rejected": -10.4375, "step": 24990 }, { "epoch": 0.9923982295615584, "grad_norm": 28.619395659506313, "learning_rate": 8.756081312272234e-11, "logits/chosen": -2.609375, "logits/rejected": -2.921875, "logps/chosen": -668.0, "logps/rejected": -1184.0, "loss": 0.1784, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.875, "rewards/margins": 5.375, "rewards/rejected": -10.25, "step": 25000 }, { "epoch": 0.9923982295615584, "eval_logits/chosen": -2.703125, "eval_logits/rejected": -2.9375, "eval_logps/chosen": -716.0, "eval_logps/rejected": -1128.0, "eval_loss": 0.25169482827186584, "eval_rewards/accuracies": 0.8951539993286133, "eval_rewards/chosen": -5.3125, "eval_rewards/margins": 4.28125, "eval_rewards/rejected": -9.625, "eval_runtime": 5403.2457, "eval_samples_per_second": 32.691, "eval_steps_per_second": 0.511, "step": 25000 }, { "epoch": 0.9927951888533831, "grad_norm": 15.381551955074212, "learning_rate": 7.863262725768537e-11, "logits/chosen": -2.59375, "logits/rejected": -2.78125, "logps/chosen": -672.0, "logps/rejected": -1224.0, "loss": 0.2106, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -4.9375, "rewards/margins": 5.375, "rewards/rejected": -10.3125, "step": 25010 }, { "epoch": 0.9931921481452077, "grad_norm": 21.678919414016914, "learning_rate": 7.018435350417128e-11, "logits/chosen": -2.671875, "logits/rejected": -2.796875, "logps/chosen": -636.0, "logps/rejected": -1184.0, "loss": 0.2336, "rewards/accuracies": 0.96875, "rewards/chosen": -4.65625, "rewards/margins": 5.53125, "rewards/rejected": -10.1875, "step": 25020 }, { "epoch": 0.9935891074370323, "grad_norm": 28.043667692029448, "learning_rate": 6.221600808500871e-11, "logits/chosen": -2.5625, "logits/rejected": -2.859375, "logps/chosen": -664.0, "logps/rejected": -1208.0, "loss": 0.1733, "rewards/accuracies": 0.96875, "rewards/chosen": -4.875, "rewards/margins": 5.59375, "rewards/rejected": -10.4375, "step": 25030 }, { "epoch": 0.9939860667288569, "grad_norm": 34.73792827714091, "learning_rate": 5.472760630140239e-11, "logits/chosen": -2.75, "logits/rejected": -3.0, "logps/chosen": -672.0, "logps/rejected": -1168.0, "loss": 0.2402, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.875, "rewards/margins": 5.03125, "rewards/rejected": -9.875, "step": 25040 }, { "epoch": 0.9943830260206816, "grad_norm": 29.721743679526963, "learning_rate": 4.771916253298869e-11, "logits/chosen": -2.765625, "logits/rejected": -3.03125, "logps/chosen": -640.0, "logps/rejected": -1208.0, "loss": 0.1735, "rewards/accuracies": 0.96875, "rewards/chosen": -4.8125, "rewards/margins": 5.53125, "rewards/rejected": -10.3125, "step": 25050 }, { "epoch": 0.9947799853125062, "grad_norm": 16.37876638987048, "learning_rate": 4.119069023775235e-11, "logits/chosen": -2.59375, "logits/rejected": -2.71875, "logps/chosen": -688.0, "logps/rejected": -1224.0, "loss": 0.1896, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.03125, "rewards/margins": 5.40625, "rewards/rejected": -10.4375, "step": 25060 }, { "epoch": 0.9951769446043308, "grad_norm": 21.495494476324858, "learning_rate": 3.514220195199868e-11, "logits/chosen": -2.65625, "logits/rejected": -2.859375, "logps/chosen": -688.0, "logps/rejected": -1152.0, "loss": 0.2057, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.09375, "rewards/margins": 4.75, "rewards/rejected": -9.875, "step": 25070 }, { "epoch": 0.9955739038961554, "grad_norm": 30.807865680858505, "learning_rate": 2.957370929035363e-11, "logits/chosen": -2.625, "logits/rejected": -2.875, "logps/chosen": -656.0, "logps/rejected": -1168.0, "loss": 0.1891, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.84375, "rewards/margins": 5.1875, "rewards/rejected": -10.0, "step": 25080 }, { "epoch": 0.9959708631879801, "grad_norm": 32.94565962302872, "learning_rate": 2.4485222945735963e-11, "logits/chosen": -2.75, "logits/rejected": -2.984375, "logps/chosen": -684.0, "logps/rejected": -1160.0, "loss": 0.2177, "rewards/accuracies": 0.96875, "rewards/chosen": -5.25, "rewards/margins": 4.6875, "rewards/rejected": -9.9375, "step": 25090 }, { "epoch": 0.9963678224798047, "grad_norm": 19.071476975302627, "learning_rate": 1.987675268930178e-11, "logits/chosen": -2.703125, "logits/rejected": -3.078125, "logps/chosen": -656.0, "logps/rejected": -1208.0, "loss": 0.1695, "rewards/accuracies": 0.96875, "rewards/chosen": -4.8125, "rewards/margins": 5.65625, "rewards/rejected": -10.4375, "step": 25100 }, { "epoch": 0.9967647817716293, "grad_norm": 24.280371364542393, "learning_rate": 1.574830737052779e-11, "logits/chosen": -2.671875, "logits/rejected": -3.078125, "logps/chosen": -680.0, "logps/rejected": -1152.0, "loss": 0.2011, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -4.9375, "rewards/margins": 4.96875, "rewards/rejected": -9.875, "step": 25110 }, { "epoch": 0.9971617410634539, "grad_norm": 29.787824043630906, "learning_rate": 1.2099894917017018e-11, "logits/chosen": -2.828125, "logits/rejected": -3.015625, "logps/chosen": -668.0, "logps/rejected": -1168.0, "loss": 0.2081, "rewards/accuracies": 0.96875, "rewards/chosen": -5.0625, "rewards/margins": 4.96875, "rewards/rejected": -10.0625, "step": 25120 }, { "epoch": 0.9975587003552786, "grad_norm": 22.812409406799127, "learning_rate": 8.93152233466532e-12, "logits/chosen": -2.671875, "logits/rejected": -2.703125, "logps/chosen": -676.0, "logps/rejected": -1192.0, "loss": 0.1971, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.03125, "rewards/margins": 5.0625, "rewards/rejected": -10.125, "step": 25130 }, { "epoch": 0.9979556596471032, "grad_norm": 31.538722015996342, "learning_rate": 6.243195707550386e-12, "logits/chosen": -2.734375, "logits/rejected": -2.828125, "logps/chosen": -712.0, "logps/rejected": -1224.0, "loss": 0.2218, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.34375, "rewards/margins": 5.28125, "rewards/rejected": -10.625, "step": 25140 }, { "epoch": 0.9983526189389278, "grad_norm": 34.48387522469746, "learning_rate": 4.034920197931723e-12, "logits/chosen": -2.75, "logits/rejected": -2.890625, "logps/chosen": -688.0, "logps/rejected": -1184.0, "loss": 0.2105, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.03125, "rewards/margins": 5.0, "rewards/rejected": -10.0625, "step": 25150 }, { "epoch": 0.9987495782307524, "grad_norm": 16.553813631563763, "learning_rate": 2.306700046250665e-12, "logits/chosen": -2.65625, "logits/rejected": -2.859375, "logps/chosen": -688.0, "logps/rejected": -1208.0, "loss": 0.2007, "rewards/accuracies": 0.956250011920929, "rewards/chosen": -5.09375, "rewards/margins": 5.25, "rewards/rejected": -10.3125, "step": 25160 }, { "epoch": 0.9991465375225771, "grad_norm": 26.458443863582193, "learning_rate": 1.058538571130363e-12, "logits/chosen": -2.59375, "logits/rejected": -2.625, "logps/chosen": -688.0, "logps/rejected": -1216.0, "loss": 0.2285, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.15625, "rewards/margins": 5.125, "rewards/rejected": -10.25, "step": 25170 }, { "epoch": 0.9995434968144017, "grad_norm": 40.96175521702255, "learning_rate": 2.9043816937579513e-13, "logits/chosen": -2.8125, "logits/rejected": -2.9375, "logps/chosen": -664.0, "logps/rejected": -1152.0, "loss": 0.2426, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -4.90625, "rewards/margins": 4.9375, "rewards/rejected": -9.8125, "step": 25180 }, { "epoch": 0.9999404561062263, "grad_norm": 31.233834071264564, "learning_rate": 2.400315918249518e-15, "logits/chosen": -2.625, "logits/rejected": -2.859375, "logps/chosen": -700.0, "logps/rejected": -1192.0, "loss": 0.217, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.21875, "rewards/margins": 5.0625, "rewards/rejected": -10.25, "step": 25190 }, { "epoch": 0.9999801520354088, "step": 25191, "total_flos": 0.0, "train_loss": 0.0, "train_runtime": 25.8505, "train_samples_per_second": 124735.05, "train_steps_per_second": 974.487 } ], "logging_steps": 10, "max_steps": 25191, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }