diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,8700 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 5733, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005232862375719519, + "grad_norm": 10.351622948893763, + "learning_rate": 8.710801393728224e-08, + "logits/chosen": -12.5625, + "logits/rejected": -11.6875, + "logps/chosen": -430.0, + "logps/rejected": -460.0, + "loss": 0.6914, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0052328623757195184, + "grad_norm": 9.861680749083718, + "learning_rate": 8.710801393728223e-07, + "logits/chosen": -11.25, + "logits/rejected": -11.3125, + "logps/chosen": -364.0, + "logps/rejected": -290.0, + "loss": 0.6937, + "rewards/accuracies": 0.125, + "rewards/chosen": 1.609325408935547e-05, + "rewards/margins": -0.00677490234375, + "rewards/rejected": 0.00677490234375, + "step": 10 + }, + { + "epoch": 0.010465724751439037, + "grad_norm": 8.879894065080062, + "learning_rate": 1.7421602787456445e-06, + "logits/chosen": -11.0, + "logits/rejected": -11.0625, + "logps/chosen": -264.0, + "logps/rejected": -256.0, + "loss": 0.691, + "rewards/accuracies": 0.25, + "rewards/chosen": -0.00103759765625, + "rewards/margins": 0.00665283203125, + "rewards/rejected": -0.0076904296875, + "step": 20 + }, + { + "epoch": 0.015698587127158554, + "grad_norm": 10.24487310428354, + "learning_rate": 2.613240418118467e-06, + "logits/chosen": -10.375, + "logits/rejected": -10.3125, + "logps/chosen": -326.0, + "logps/rejected": -318.0, + "loss": 0.6854, + "rewards/accuracies": 0.36250001192092896, + "rewards/chosen": 0.01275634765625, + "rewards/margins": 0.02587890625, + "rewards/rejected": -0.0130615234375, + "step": 30 + }, + { + "epoch": 0.020931449502878074, + "grad_norm": 9.307003472676271, + "learning_rate": 3.484320557491289e-06, + "logits/chosen": -11.3125, + "logits/rejected": -10.9375, + "logps/chosen": -336.0, + "logps/rejected": -310.0, + "loss": 0.6801, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.01080322265625, + "rewards/margins": 0.0126953125, + "rewards/rejected": -0.0235595703125, + "step": 40 + }, + { + "epoch": 0.026164311878597593, + "grad_norm": 9.716648284939643, + "learning_rate": 4.355400696864112e-06, + "logits/chosen": -11.75, + "logits/rejected": -11.5, + "logps/chosen": -314.0, + "logps/rejected": -312.0, + "loss": 0.6714, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.03125, + "rewards/margins": 0.05859375, + "rewards/rejected": -0.08984375, + "step": 50 + }, + { + "epoch": 0.03139717425431711, + "grad_norm": 9.029736646123915, + "learning_rate": 5.226480836236934e-06, + "logits/chosen": -11.6875, + "logits/rejected": -11.3125, + "logps/chosen": -330.0, + "logps/rejected": -320.0, + "loss": 0.6424, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.103515625, + "rewards/margins": 0.06884765625, + "rewards/rejected": -0.171875, + "step": 60 + }, + { + "epoch": 0.03663003663003663, + "grad_norm": 10.22440855937027, + "learning_rate": 6.0975609756097564e-06, + "logits/chosen": -12.875, + "logits/rejected": -12.75, + "logps/chosen": -374.0, + "logps/rejected": -332.0, + "loss": 0.629, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.369140625, + "rewards/margins": 0.189453125, + "rewards/rejected": -0.55859375, + "step": 70 + }, + { + "epoch": 0.04186289900575615, + "grad_norm": 9.387356779175425, + "learning_rate": 6.968641114982578e-06, + "logits/chosen": -13.0, + "logits/rejected": -13.0, + "logps/chosen": -326.0, + "logps/rejected": -318.0, + "loss": 0.6411, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.5859375, + "rewards/margins": 0.201171875, + "rewards/rejected": -0.78515625, + "step": 80 + }, + { + "epoch": 0.04709576138147567, + "grad_norm": 8.674280226009019, + "learning_rate": 7.8397212543554e-06, + "logits/chosen": -12.1875, + "logits/rejected": -11.75, + "logps/chosen": -296.0, + "logps/rejected": -274.0, + "loss": 0.655, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.3125, + "rewards/margins": 0.17578125, + "rewards/rejected": -0.48828125, + "step": 90 + }, + { + "epoch": 0.052328623757195186, + "grad_norm": 11.202378620141069, + "learning_rate": 8.710801393728225e-06, + "logits/chosen": -12.1875, + "logits/rejected": -11.25, + "logps/chosen": -352.0, + "logps/rejected": -282.0, + "loss": 0.616, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -0.181640625, + "rewards/margins": 0.322265625, + "rewards/rejected": -0.50390625, + "step": 100 + }, + { + "epoch": 0.0575614861329147, + "grad_norm": 8.409262794688129, + "learning_rate": 9.581881533101046e-06, + "logits/chosen": -11.4375, + "logits/rejected": -10.9375, + "logps/chosen": -314.0, + "logps/rejected": -284.0, + "loss": 0.6012, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -0.30078125, + "rewards/margins": 0.41796875, + "rewards/rejected": -0.71875, + "step": 110 + }, + { + "epoch": 0.06279434850863422, + "grad_norm": 10.27018949715906, + "learning_rate": 1.0452961672473868e-05, + "logits/chosen": -12.0, + "logits/rejected": -11.75, + "logps/chosen": -420.0, + "logps/rejected": -374.0, + "loss": 0.6532, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": -0.494140625, + "rewards/margins": 0.2578125, + "rewards/rejected": -0.75, + "step": 120 + }, + { + "epoch": 0.06802721088435375, + "grad_norm": 8.585428960670738, + "learning_rate": 1.132404181184669e-05, + "logits/chosen": -12.125, + "logits/rejected": -11.25, + "logps/chosen": -280.0, + "logps/rejected": -286.0, + "loss": 0.6281, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5390625, + "rewards/margins": 0.29296875, + "rewards/rejected": -0.83203125, + "step": 130 + }, + { + "epoch": 0.07326007326007326, + "grad_norm": 11.197641077169887, + "learning_rate": 1.2195121951219513e-05, + "logits/chosen": -12.8125, + "logits/rejected": -12.625, + "logps/chosen": -356.0, + "logps/rejected": -334.0, + "loss": 0.5618, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": -0.8515625, + "rewards/margins": 0.5546875, + "rewards/rejected": -1.40625, + "step": 140 + }, + { + "epoch": 0.07849293563579278, + "grad_norm": 9.57565914517034, + "learning_rate": 1.3066202090592336e-05, + "logits/chosen": -13.4375, + "logits/rejected": -13.3125, + "logps/chosen": -352.0, + "logps/rejected": -320.0, + "loss": 0.6211, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.2421875, + "rewards/margins": 0.48046875, + "rewards/rejected": -1.71875, + "step": 150 + }, + { + "epoch": 0.0837257980115123, + "grad_norm": 10.73566421003925, + "learning_rate": 1.3937282229965156e-05, + "logits/chosen": -13.375, + "logits/rejected": -13.625, + "logps/chosen": -368.0, + "logps/rejected": -322.0, + "loss": 0.626, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.359375, + "rewards/margins": 0.63671875, + "rewards/rejected": -2.0, + "step": 160 + }, + { + "epoch": 0.08895866038723181, + "grad_norm": 8.935497274328549, + "learning_rate": 1.4808362369337981e-05, + "logits/chosen": -13.5625, + "logits/rejected": -13.375, + "logps/chosen": -348.0, + "logps/rejected": -354.0, + "loss": 0.6129, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -1.8046875, + "rewards/margins": 0.09375, + "rewards/rejected": -1.8984375, + "step": 170 + }, + { + "epoch": 0.09419152276295134, + "grad_norm": 6.705738405802169, + "learning_rate": 1.56794425087108e-05, + "logits/chosen": -12.1875, + "logits/rejected": -12.1875, + "logps/chosen": -306.0, + "logps/rejected": -320.0, + "loss": 0.6128, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.6328125, + "rewards/margins": 0.3046875, + "rewards/rejected": -1.9375, + "step": 180 + }, + { + "epoch": 0.09942438513867086, + "grad_norm": 9.341880358696546, + "learning_rate": 1.6550522648083624e-05, + "logits/chosen": -12.25, + "logits/rejected": -12.25, + "logps/chosen": -416.0, + "logps/rejected": -352.0, + "loss": 0.5737, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -1.6953125, + "rewards/margins": 0.53125, + "rewards/rejected": -2.21875, + "step": 190 + }, + { + "epoch": 0.10465724751439037, + "grad_norm": 8.99823436974031, + "learning_rate": 1.742160278745645e-05, + "logits/chosen": -13.0, + "logits/rejected": -12.8125, + "logps/chosen": -362.0, + "logps/rejected": -358.0, + "loss": 0.6059, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.765625, + "rewards/margins": 0.361328125, + "rewards/rejected": -2.125, + "step": 200 + }, + { + "epoch": 0.10989010989010989, + "grad_norm": 8.121852231192932, + "learning_rate": 1.8292682926829268e-05, + "logits/chosen": -12.6875, + "logits/rejected": -12.75, + "logps/chosen": -310.0, + "logps/rejected": -288.0, + "loss": 0.6293, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.5078125, + "rewards/margins": 0.296875, + "rewards/rejected": -1.796875, + "step": 210 + }, + { + "epoch": 0.1151229722658294, + "grad_norm": 8.331199806773945, + "learning_rate": 1.9163763066202093e-05, + "logits/chosen": -13.25, + "logits/rejected": -12.6875, + "logps/chosen": -330.0, + "logps/rejected": -336.0, + "loss": 0.5854, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.7109375, + "rewards/margins": 0.49609375, + "rewards/rejected": -2.203125, + "step": 220 + }, + { + "epoch": 0.12035583464154893, + "grad_norm": 9.385440280022902, + "learning_rate": 2.0034843205574914e-05, + "logits/chosen": -12.9375, + "logits/rejected": -12.625, + "logps/chosen": -378.0, + "logps/rejected": -412.0, + "loss": 0.5854, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.9140625, + "rewards/margins": 0.53515625, + "rewards/rejected": -2.453125, + "step": 230 + }, + { + "epoch": 0.12558869701726844, + "grad_norm": 7.856024282920787, + "learning_rate": 2.0905923344947736e-05, + "logits/chosen": -11.75, + "logits/rejected": -11.0625, + "logps/chosen": -324.0, + "logps/rejected": -332.0, + "loss": 0.6277, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -1.7890625, + "rewards/margins": 0.6328125, + "rewards/rejected": -2.421875, + "step": 240 + }, + { + "epoch": 0.13082155939298795, + "grad_norm": 10.33844224138613, + "learning_rate": 2.1777003484320557e-05, + "logits/chosen": -11.5, + "logits/rejected": -11.0625, + "logps/chosen": -376.0, + "logps/rejected": -340.0, + "loss": 0.6369, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -1.5234375, + "rewards/margins": 0.58203125, + "rewards/rejected": -2.109375, + "step": 250 + }, + { + "epoch": 0.1360544217687075, + "grad_norm": 10.80456220320549, + "learning_rate": 2.264808362369338e-05, + "logits/chosen": -11.4375, + "logits/rejected": -10.9375, + "logps/chosen": -356.0, + "logps/rejected": -328.0, + "loss": 0.6055, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.53125, + "rewards/margins": 0.68359375, + "rewards/rejected": -2.21875, + "step": 260 + }, + { + "epoch": 0.141287284144427, + "grad_norm": 7.153194193188069, + "learning_rate": 2.3519163763066204e-05, + "logits/chosen": -11.0625, + "logits/rejected": -10.875, + "logps/chosen": -350.0, + "logps/rejected": -346.0, + "loss": 0.5423, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.1015625, + "rewards/margins": 0.60546875, + "rewards/rejected": -1.703125, + "step": 270 + }, + { + "epoch": 0.14652014652014653, + "grad_norm": 10.212050819236445, + "learning_rate": 2.4390243902439026e-05, + "logits/chosen": -12.4375, + "logits/rejected": -12.5, + "logps/chosen": -384.0, + "logps/rejected": -360.0, + "loss": 0.6549, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.484375, + "rewards/margins": 0.609375, + "rewards/rejected": -2.09375, + "step": 280 + }, + { + "epoch": 0.15175300889586604, + "grad_norm": 8.60794177402858, + "learning_rate": 2.5261324041811847e-05, + "logits/chosen": -13.5, + "logits/rejected": -13.0625, + "logps/chosen": -368.0, + "logps/rejected": -322.0, + "loss": 0.6284, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -1.2734375, + "rewards/margins": 0.65625, + "rewards/rejected": -1.9296875, + "step": 290 + }, + { + "epoch": 0.15698587127158556, + "grad_norm": 8.37696998776307, + "learning_rate": 2.6132404181184672e-05, + "logits/chosen": -14.25, + "logits/rejected": -13.875, + "logps/chosen": -364.0, + "logps/rejected": -330.0, + "loss": 0.618, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -1.4921875, + "rewards/margins": 0.59765625, + "rewards/rejected": -2.09375, + "step": 300 + }, + { + "epoch": 0.16221873364730507, + "grad_norm": 8.375659905100978, + "learning_rate": 2.7003484320557494e-05, + "logits/chosen": -13.1875, + "logits/rejected": -12.875, + "logps/chosen": -374.0, + "logps/rejected": -334.0, + "loss": 0.6298, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -1.8828125, + "rewards/margins": 0.5625, + "rewards/rejected": -2.4375, + "step": 310 + }, + { + "epoch": 0.1674515960230246, + "grad_norm": 10.402832073203621, + "learning_rate": 2.7874564459930312e-05, + "logits/chosen": -14.1875, + "logits/rejected": -13.75, + "logps/chosen": -416.0, + "logps/rejected": -342.0, + "loss": 0.5864, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -2.21875, + "rewards/margins": 0.578125, + "rewards/rejected": -2.796875, + "step": 320 + }, + { + "epoch": 0.1726844583987441, + "grad_norm": 12.297870357791856, + "learning_rate": 2.874564459930314e-05, + "logits/chosen": -13.6875, + "logits/rejected": -13.4375, + "logps/chosen": -368.0, + "logps/rejected": -332.0, + "loss": 0.7922, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.421875, + "rewards/margins": 0.859375, + "rewards/rejected": -3.28125, + "step": 330 + }, + { + "epoch": 0.17791732077446362, + "grad_norm": 9.64969634838185, + "learning_rate": 2.9616724738675962e-05, + "logits/chosen": -13.5, + "logits/rejected": -13.125, + "logps/chosen": -374.0, + "logps/rejected": -350.0, + "loss": 0.6487, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.59375, + "rewards/margins": 0.7265625, + "rewards/rejected": -3.328125, + "step": 340 + }, + { + "epoch": 0.18315018315018314, + "grad_norm": 19.1161905223819, + "learning_rate": 3.048780487804878e-05, + "logits/chosen": -13.3125, + "logits/rejected": -13.125, + "logps/chosen": -292.0, + "logps/rejected": -316.0, + "loss": 0.6017, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -2.515625, + "rewards/margins": 0.78125, + "rewards/rejected": -3.296875, + "step": 350 + }, + { + "epoch": 0.18838304552590268, + "grad_norm": 12.072342717412461, + "learning_rate": 3.13588850174216e-05, + "logits/chosen": -13.75, + "logits/rejected": -13.4375, + "logps/chosen": -390.0, + "logps/rejected": -370.0, + "loss": 0.643, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -2.765625, + "rewards/margins": 0.640625, + "rewards/rejected": -3.40625, + "step": 360 + }, + { + "epoch": 0.1936159079016222, + "grad_norm": 8.321040961655044, + "learning_rate": 3.222996515679443e-05, + "logits/chosen": -12.75, + "logits/rejected": -12.5625, + "logps/chosen": -344.0, + "logps/rejected": -326.0, + "loss": 0.7638, + "rewards/accuracies": 0.6875, + "rewards/chosen": -2.6875, + "rewards/margins": 0.474609375, + "rewards/rejected": -3.15625, + "step": 370 + }, + { + "epoch": 0.1988487702773417, + "grad_norm": 7.6849275188322395, + "learning_rate": 3.310104529616725e-05, + "logits/chosen": -12.375, + "logits/rejected": -12.3125, + "logps/chosen": -312.0, + "logps/rejected": -348.0, + "loss": 0.8002, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -2.015625, + "rewards/margins": 0.6015625, + "rewards/rejected": -2.609375, + "step": 380 + }, + { + "epoch": 0.20408163265306123, + "grad_norm": 9.09533864031769, + "learning_rate": 3.397212543554007e-05, + "logits/chosen": -12.625, + "logits/rejected": -12.25, + "logps/chosen": -410.0, + "logps/rejected": -360.0, + "loss": 0.6596, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -2.34375, + "rewards/margins": 0.96875, + "rewards/rejected": -3.3125, + "step": 390 + }, + { + "epoch": 0.20931449502878074, + "grad_norm": 10.474251698898403, + "learning_rate": 3.48432055749129e-05, + "logits/chosen": -13.6875, + "logits/rejected": -13.375, + "logps/chosen": -434.0, + "logps/rejected": -364.0, + "loss": 0.6416, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -3.375, + "rewards/margins": 0.6875, + "rewards/rejected": -4.0625, + "step": 400 + }, + { + "epoch": 0.21454735740450026, + "grad_norm": 9.570607335225267, + "learning_rate": 3.571428571428572e-05, + "logits/chosen": -11.5, + "logits/rejected": -11.3125, + "logps/chosen": -414.0, + "logps/rejected": -408.0, + "loss": 0.7146, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.15625, + "rewards/margins": 0.416015625, + "rewards/rejected": -3.578125, + "step": 410 + }, + { + "epoch": 0.21978021978021978, + "grad_norm": 10.413809913164531, + "learning_rate": 3.6585365853658535e-05, + "logits/chosen": -11.3125, + "logits/rejected": -10.9375, + "logps/chosen": -380.0, + "logps/rejected": -358.0, + "loss": 0.6777, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -3.0625, + "rewards/margins": 0.25390625, + "rewards/rejected": -3.3125, + "step": 420 + }, + { + "epoch": 0.2250130821559393, + "grad_norm": 9.443240821693522, + "learning_rate": 3.745644599303136e-05, + "logits/chosen": -11.6875, + "logits/rejected": -11.25, + "logps/chosen": -388.0, + "logps/rejected": -318.0, + "loss": 0.6791, + "rewards/accuracies": 0.625, + "rewards/chosen": -3.0, + "rewards/margins": 0.6015625, + "rewards/rejected": -3.59375, + "step": 430 + }, + { + "epoch": 0.2302459445316588, + "grad_norm": 9.073494348342924, + "learning_rate": 3.8327526132404185e-05, + "logits/chosen": -11.5, + "logits/rejected": -11.125, + "logps/chosen": -416.0, + "logps/rejected": -348.0, + "loss": 0.6659, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -3.390625, + "rewards/margins": 0.392578125, + "rewards/rejected": -3.78125, + "step": 440 + }, + { + "epoch": 0.23547880690737832, + "grad_norm": 16.732406709817493, + "learning_rate": 3.9198606271777003e-05, + "logits/chosen": -10.4375, + "logits/rejected": -10.3125, + "logps/chosen": -352.0, + "logps/rejected": -360.0, + "loss": 0.7231, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -3.125, + "rewards/margins": 0.40234375, + "rewards/rejected": -3.53125, + "step": 450 + }, + { + "epoch": 0.24071166928309787, + "grad_norm": 9.62863918950795, + "learning_rate": 4.006968641114983e-05, + "logits/chosen": -10.625, + "logits/rejected": -9.9375, + "logps/chosen": -472.0, + "logps/rejected": -424.0, + "loss": 0.6618, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -4.28125, + "rewards/margins": 0.73828125, + "rewards/rejected": -5.03125, + "step": 460 + }, + { + "epoch": 0.24594453165881738, + "grad_norm": 13.659583611405836, + "learning_rate": 4.0940766550522653e-05, + "logits/chosen": -8.1875, + "logits/rejected": -7.875, + "logps/chosen": -444.0, + "logps/rejected": -428.0, + "loss": 0.7856, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -5.5625, + "rewards/margins": 0.609375, + "rewards/rejected": -6.15625, + "step": 470 + }, + { + "epoch": 0.25117739403453687, + "grad_norm": 11.696489303834198, + "learning_rate": 4.181184668989547e-05, + "logits/chosen": -7.65625, + "logits/rejected": -7.3125, + "logps/chosen": -380.0, + "logps/rejected": -368.0, + "loss": 0.6726, + "rewards/accuracies": 0.6875, + "rewards/chosen": -4.03125, + "rewards/margins": 0.609375, + "rewards/rejected": -4.65625, + "step": 480 + }, + { + "epoch": 0.2564102564102564, + "grad_norm": 9.07444220090164, + "learning_rate": 4.26829268292683e-05, + "logits/chosen": -8.9375, + "logits/rejected": -8.375, + "logps/chosen": -424.0, + "logps/rejected": -392.0, + "loss": 0.6257, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -4.125, + "rewards/margins": 0.58203125, + "rewards/rejected": -4.6875, + "step": 490 + }, + { + "epoch": 0.2616431187859759, + "grad_norm": 6.290512673617353, + "learning_rate": 4.3554006968641115e-05, + "logits/chosen": -8.0625, + "logits/rejected": -7.84375, + "logps/chosen": -402.0, + "logps/rejected": -386.0, + "loss": 0.6275, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.5, + "rewards/margins": 0.59765625, + "rewards/rejected": -5.09375, + "step": 500 + }, + { + "epoch": 0.2668759811616955, + "grad_norm": 8.840142113315412, + "learning_rate": 4.442508710801394e-05, + "logits/chosen": -9.0, + "logits/rejected": -7.96875, + "logps/chosen": -408.0, + "logps/rejected": -378.0, + "loss": 0.6852, + "rewards/accuracies": 0.625, + "rewards/chosen": -4.5, + "rewards/margins": 0.447265625, + "rewards/rejected": -4.9375, + "step": 510 + }, + { + "epoch": 0.272108843537415, + "grad_norm": 7.427335145965296, + "learning_rate": 4.529616724738676e-05, + "logits/chosen": -9.625, + "logits/rejected": -9.4375, + "logps/chosen": -442.0, + "logps/rejected": -448.0, + "loss": 0.7685, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.21875, + "rewards/margins": 0.435546875, + "rewards/rejected": -4.65625, + "step": 520 + }, + { + "epoch": 0.2773417059131345, + "grad_norm": 6.490058212952471, + "learning_rate": 4.616724738675958e-05, + "logits/chosen": -8.625, + "logits/rejected": -8.1875, + "logps/chosen": -460.0, + "logps/rejected": -384.0, + "loss": 0.7412, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -4.75, + "rewards/margins": 0.62890625, + "rewards/rejected": -5.375, + "step": 530 + }, + { + "epoch": 0.282574568288854, + "grad_norm": 12.243471295986348, + "learning_rate": 4.703832752613241e-05, + "logits/chosen": -8.375, + "logits/rejected": -7.6875, + "logps/chosen": -458.0, + "logps/rejected": -432.0, + "loss": 0.6623, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -5.4375, + "rewards/margins": 1.0, + "rewards/rejected": -6.4375, + "step": 540 + }, + { + "epoch": 0.28780743066457354, + "grad_norm": 8.28734090693043, + "learning_rate": 4.7909407665505226e-05, + "logits/chosen": -8.25, + "logits/rejected": -7.375, + "logps/chosen": -460.0, + "logps/rejected": -432.0, + "loss": 0.8084, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -4.53125, + "rewards/margins": 0.455078125, + "rewards/rejected": -5.0, + "step": 550 + }, + { + "epoch": 0.29304029304029305, + "grad_norm": 8.12376561727762, + "learning_rate": 4.878048780487805e-05, + "logits/chosen": -7.53125, + "logits/rejected": -7.125, + "logps/chosen": -416.0, + "logps/rejected": -404.0, + "loss": 0.7299, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -4.21875, + "rewards/margins": 0.5234375, + "rewards/rejected": -4.75, + "step": 560 + }, + { + "epoch": 0.29827315541601257, + "grad_norm": 12.391849545439836, + "learning_rate": 4.965156794425087e-05, + "logits/chosen": -8.5, + "logits/rejected": -7.5625, + "logps/chosen": -440.0, + "logps/rejected": -394.0, + "loss": 0.8111, + "rewards/accuracies": 0.5625, + "rewards/chosen": -5.03125, + "rewards/margins": -0.00567626953125, + "rewards/rejected": -5.0, + "step": 570 + }, + { + "epoch": 0.3035060177917321, + "grad_norm": 10.880713710599428, + "learning_rate": 4.999983312905697e-05, + "logits/chosen": -7.40625, + "logits/rejected": -6.4375, + "logps/chosen": -408.0, + "logps/rejected": -356.0, + "loss": 0.6451, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -5.0, + "rewards/margins": 0.73046875, + "rewards/rejected": -5.75, + "step": 580 + }, + { + "epoch": 0.3087388801674516, + "grad_norm": 19.233804877733014, + "learning_rate": 4.9998813370250145e-05, + "logits/chosen": -5.1875, + "logits/rejected": -4.53125, + "logps/chosen": -436.0, + "logps/rejected": -418.0, + "loss": 0.7623, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -6.65625, + "rewards/margins": 0.5859375, + "rewards/rejected": -7.21875, + "step": 590 + }, + { + "epoch": 0.3139717425431711, + "grad_norm": 6.44253234493605, + "learning_rate": 4.999686659648518e-05, + "logits/chosen": -5.59375, + "logits/rejected": -5.5, + "logps/chosen": -466.0, + "logps/rejected": -468.0, + "loss": 0.7429, + "rewards/accuracies": 0.625, + "rewards/chosen": -6.375, + "rewards/margins": 0.47265625, + "rewards/rejected": -6.84375, + "step": 600 + }, + { + "epoch": 0.31920460491889063, + "grad_norm": 12.099417587710867, + "learning_rate": 4.999399287995303e-05, + "logits/chosen": -6.875, + "logits/rejected": -5.75, + "logps/chosen": -390.0, + "logps/rejected": -366.0, + "loss": 0.7365, + "rewards/accuracies": 0.625, + "rewards/chosen": -5.0625, + "rewards/margins": 0.5546875, + "rewards/rejected": -5.625, + "step": 610 + }, + { + "epoch": 0.32443746729461015, + "grad_norm": 8.715383868761732, + "learning_rate": 4.9990192327217914e-05, + "logits/chosen": -8.1875, + "logits/rejected": -5.96875, + "logps/chosen": -520.0, + "logps/rejected": -390.0, + "loss": 0.7497, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -6.0, + "rewards/margins": 0.88671875, + "rewards/rejected": -6.875, + "step": 620 + }, + { + "epoch": 0.32967032967032966, + "grad_norm": 8.279369051319126, + "learning_rate": 4.998546507921325e-05, + "logits/chosen": -6.25, + "logits/rejected": -5.90625, + "logps/chosen": -384.0, + "logps/rejected": -406.0, + "loss": 0.8175, + "rewards/accuracies": 0.625, + "rewards/chosen": -6.25, + "rewards/margins": 0.388671875, + "rewards/rejected": -6.65625, + "step": 630 + }, + { + "epoch": 0.3349031920460492, + "grad_norm": 9.292602707728573, + "learning_rate": 4.997981131123657e-05, + "logits/chosen": -7.15625, + "logits/rejected": -6.53125, + "logps/chosen": -444.0, + "logps/rejected": -418.0, + "loss": 0.7555, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -5.71875, + "rewards/margins": 0.62109375, + "rewards/rejected": -6.34375, + "step": 640 + }, + { + "epoch": 0.3401360544217687, + "grad_norm": 7.05082008534396, + "learning_rate": 4.9973231232942906e-05, + "logits/chosen": -7.21875, + "logits/rejected": -6.625, + "logps/chosen": -434.0, + "logps/rejected": -398.0, + "loss": 0.6927, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -5.15625, + "rewards/margins": 0.578125, + "rewards/rejected": -5.71875, + "step": 650 + }, + { + "epoch": 0.3453689167974882, + "grad_norm": 9.395818298567765, + "learning_rate": 4.9965725088337103e-05, + "logits/chosen": -5.9375, + "logits/rejected": -5.21875, + "logps/chosen": -378.0, + "logps/rejected": -368.0, + "loss": 0.7448, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -5.34375, + "rewards/margins": 0.62109375, + "rewards/rejected": -5.96875, + "step": 660 + }, + { + "epoch": 0.35060177917320773, + "grad_norm": 11.966195250971206, + "learning_rate": 4.995729315576468e-05, + "logits/chosen": -6.25, + "logits/rejected": -4.84375, + "logps/chosen": -440.0, + "logps/rejected": -416.0, + "loss": 0.6844, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -6.5, + "rewards/margins": 0.66015625, + "rewards/rejected": -7.15625, + "step": 670 + }, + { + "epoch": 0.35583464154892724, + "grad_norm": 8.22871148505434, + "learning_rate": 4.994793574790161e-05, + "logits/chosen": -6.125, + "logits/rejected": -5.5, + "logps/chosen": -448.0, + "logps/rejected": -410.0, + "loss": 0.6867, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -6.625, + "rewards/margins": 0.59375, + "rewards/rejected": -7.1875, + "step": 680 + }, + { + "epoch": 0.36106750392464676, + "grad_norm": 9.281604668745217, + "learning_rate": 4.993765321174262e-05, + "logits/chosen": -3.90625, + "logits/rejected": -2.71875, + "logps/chosen": -512.0, + "logps/rejected": -480.0, + "loss": 0.7281, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -8.25, + "rewards/margins": 0.98046875, + "rewards/rejected": -9.25, + "step": 690 + }, + { + "epoch": 0.3663003663003663, + "grad_norm": 9.358944680779736, + "learning_rate": 4.992644592858842e-05, + "logits/chosen": -6.65625, + "logits/rejected": -5.9375, + "logps/chosen": -482.0, + "logps/rejected": -422.0, + "loss": 0.7949, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -6.75, + "rewards/margins": 0.27734375, + "rewards/rejected": -7.03125, + "step": 700 + }, + { + "epoch": 0.3715332286760858, + "grad_norm": 14.819279276785178, + "learning_rate": 4.9914314314031484e-05, + "logits/chosen": -7.03125, + "logits/rejected": -6.0, + "logps/chosen": -500.0, + "logps/rejected": -464.0, + "loss": 0.7073, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -7.15625, + "rewards/margins": 0.97265625, + "rewards/rejected": -8.125, + "step": 710 + }, + { + "epoch": 0.37676609105180536, + "grad_norm": 9.493964933081799, + "learning_rate": 4.990125881794071e-05, + "logits/chosen": -5.5, + "logits/rejected": -5.1875, + "logps/chosen": -430.0, + "logps/rejected": -428.0, + "loss": 0.8044, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -6.96875, + "rewards/margins": 0.55859375, + "rewards/rejected": -7.53125, + "step": 720 + }, + { + "epoch": 0.3819989534275249, + "grad_norm": 9.549194684405798, + "learning_rate": 4.988727992444467e-05, + "logits/chosen": -6.875, + "logits/rejected": -6.375, + "logps/chosen": -448.0, + "logps/rejected": -436.0, + "loss": 0.7151, + "rewards/accuracies": 0.625, + "rewards/chosen": -6.1875, + "rewards/margins": 0.69140625, + "rewards/rejected": -6.875, + "step": 730 + }, + { + "epoch": 0.3872318158032444, + "grad_norm": 11.119599516244255, + "learning_rate": 4.987237815191371e-05, + "logits/chosen": -6.5, + "logits/rejected": -5.9375, + "logps/chosen": -442.0, + "logps/rejected": -404.0, + "loss": 0.6481, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -6.875, + "rewards/margins": 0.78125, + "rewards/rejected": -7.65625, + "step": 740 + }, + { + "epoch": 0.3924646781789639, + "grad_norm": 29.084911099228044, + "learning_rate": 4.9856554052940705e-05, + "logits/chosen": -3.890625, + "logits/rejected": -3.40625, + "logps/chosen": -476.0, + "logps/rejected": -496.0, + "loss": 0.7091, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -8.4375, + "rewards/margins": 0.86328125, + "rewards/rejected": -9.3125, + "step": 750 + }, + { + "epoch": 0.3976975405546834, + "grad_norm": 11.3604312102586, + "learning_rate": 4.983980821432055e-05, + "logits/chosen": -3.890625, + "logits/rejected": -2.84375, + "logps/chosen": -442.0, + "logps/rejected": -424.0, + "loss": 0.7856, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -7.40625, + "rewards/margins": 0.63671875, + "rewards/rejected": -8.0, + "step": 760 + }, + { + "epoch": 0.40293040293040294, + "grad_norm": 7.746503351769591, + "learning_rate": 4.982214125702845e-05, + "logits/chosen": -4.84375, + "logits/rejected": -4.65625, + "logps/chosen": -482.0, + "logps/rejected": -488.0, + "loss": 1.0012, + "rewards/accuracies": 0.625, + "rewards/chosen": -7.375, + "rewards/margins": 0.56640625, + "rewards/rejected": -7.9375, + "step": 770 + }, + { + "epoch": 0.40816326530612246, + "grad_norm": 11.766587625625025, + "learning_rate": 4.9803553836196845e-05, + "logits/chosen": -3.484375, + "logits/rejected": -1.890625, + "logps/chosen": -474.0, + "logps/rejected": -444.0, + "loss": 0.6736, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -7.5, + "rewards/margins": 0.578125, + "rewards/rejected": -8.0625, + "step": 780 + }, + { + "epoch": 0.413396127681842, + "grad_norm": 9.097797617055331, + "learning_rate": 4.978404664109113e-05, + "logits/chosen": -4.21875, + "logits/rejected": -3.21875, + "logps/chosen": -426.0, + "logps/rejected": -448.0, + "loss": 0.7096, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -7.4375, + "rewards/margins": 1.0703125, + "rewards/rejected": -8.5, + "step": 790 + }, + { + "epoch": 0.4186289900575615, + "grad_norm": 11.540849585926436, + "learning_rate": 4.976362039508411e-05, + "logits/chosen": -8.1875, + "logits/rejected": -7.46875, + "logps/chosen": -504.0, + "logps/rejected": -472.0, + "loss": 0.7436, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -6.84375, + "rewards/margins": 0.609375, + "rewards/rejected": -7.46875, + "step": 800 + }, + { + "epoch": 0.423861852433281, + "grad_norm": 10.390351331388345, + "learning_rate": 4.9742275855629164e-05, + "logits/chosen": -7.75, + "logits/rejected": -7.0, + "logps/chosen": -468.0, + "logps/rejected": -446.0, + "loss": 0.7135, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -6.6875, + "rewards/margins": 0.7890625, + "rewards/rejected": -7.5, + "step": 810 + }, + { + "epoch": 0.4290947148090005, + "grad_norm": 10.670994286644559, + "learning_rate": 4.9720013814232146e-05, + "logits/chosen": -4.75, + "logits/rejected": -3.453125, + "logps/chosen": -492.0, + "logps/rejected": -454.0, + "loss": 0.7545, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -8.75, + "rewards/margins": 0.61328125, + "rewards/rejected": -9.3125, + "step": 820 + }, + { + "epoch": 0.43432757718472004, + "grad_norm": 10.630956069066174, + "learning_rate": 4.969683509642207e-05, + "logits/chosen": -5.90625, + "logits/rejected": -4.96875, + "logps/chosen": -456.0, + "logps/rejected": -436.0, + "loss": 0.8482, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -8.1875, + "rewards/margins": 0.24609375, + "rewards/rejected": -8.4375, + "step": 830 + }, + { + "epoch": 0.43956043956043955, + "grad_norm": 8.101257247067318, + "learning_rate": 4.967274056172044e-05, + "logits/chosen": -10.875, + "logits/rejected": -10.5, + "logps/chosen": -520.0, + "logps/rejected": -462.0, + "loss": 0.7421, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -6.8125, + "rewards/margins": 0.91015625, + "rewards/rejected": -7.71875, + "step": 840 + }, + { + "epoch": 0.44479330193615907, + "grad_norm": 10.944762300383431, + "learning_rate": 4.964773110360944e-05, + "logits/chosen": -10.75, + "logits/rejected": -10.875, + "logps/chosen": -456.0, + "logps/rejected": -416.0, + "loss": 0.8146, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -7.375, + "rewards/margins": 0.6171875, + "rewards/rejected": -8.0, + "step": 850 + }, + { + "epoch": 0.4500261643118786, + "grad_norm": 8.913339988594448, + "learning_rate": 4.9621807649498764e-05, + "logits/chosen": -11.4375, + "logits/rejected": -11.125, + "logps/chosen": -432.0, + "logps/rejected": -472.0, + "loss": 0.6892, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -6.4375, + "rewards/margins": 0.8046875, + "rewards/rejected": -7.25, + "step": 860 + }, + { + "epoch": 0.4552590266875981, + "grad_norm": 7.134016274199884, + "learning_rate": 4.9594971160691226e-05, + "logits/chosen": -10.5625, + "logits/rejected": -10.4375, + "logps/chosen": -466.0, + "logps/rejected": -426.0, + "loss": 0.7858, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -6.84375, + "rewards/margins": 0.5390625, + "rewards/rejected": -7.40625, + "step": 870 + }, + { + "epoch": 0.4604918890633176, + "grad_norm": 10.328569383276303, + "learning_rate": 4.9567222632347116e-05, + "logits/chosen": -11.3125, + "logits/rejected": -11.5, + "logps/chosen": -520.0, + "logps/rejected": -458.0, + "loss": 0.6878, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -8.125, + "rewards/margins": 0.828125, + "rewards/rejected": -9.0, + "step": 880 + }, + { + "epoch": 0.46572475143903713, + "grad_norm": 9.62796454911049, + "learning_rate": 4.953856309344731e-05, + "logits/chosen": -11.625, + "logits/rejected": -11.6875, + "logps/chosen": -540.0, + "logps/rejected": -496.0, + "loss": 0.7736, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -8.875, + "rewards/margins": 0.609375, + "rewards/rejected": -9.5, + "step": 890 + }, + { + "epoch": 0.47095761381475665, + "grad_norm": 10.482736030956378, + "learning_rate": 4.9508993606755115e-05, + "logits/chosen": -10.9375, + "logits/rejected": -10.875, + "logps/chosen": -460.0, + "logps/rejected": -488.0, + "loss": 0.8452, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -7.09375, + "rewards/margins": 1.1328125, + "rewards/rejected": -8.1875, + "step": 900 + }, + { + "epoch": 0.47619047619047616, + "grad_norm": 12.475233890425232, + "learning_rate": 4.947851526877682e-05, + "logits/chosen": -12.5625, + "logits/rejected": -12.625, + "logps/chosen": -424.0, + "logps/rejected": -410.0, + "loss": 0.8266, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -7.40625, + "rewards/margins": 0.734375, + "rewards/rejected": -8.125, + "step": 910 + }, + { + "epoch": 0.48142333856619574, + "grad_norm": 9.454823173352347, + "learning_rate": 4.944712920972109e-05, + "logits/chosen": -12.4375, + "logits/rejected": -12.625, + "logps/chosen": -494.0, + "logps/rejected": -462.0, + "loss": 0.6861, + "rewards/accuracies": 0.5625, + "rewards/chosen": -7.6875, + "rewards/margins": 0.3359375, + "rewards/rejected": -8.0, + "step": 920 + }, + { + "epoch": 0.48665620094191525, + "grad_norm": 18.342099499785732, + "learning_rate": 4.9414836593457004e-05, + "logits/chosen": -12.125, + "logits/rejected": -12.25, + "logps/chosen": -488.0, + "logps/rejected": -452.0, + "loss": 0.6957, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -7.71875, + "rewards/margins": 0.8125, + "rewards/rejected": -8.5625, + "step": 930 + }, + { + "epoch": 0.49188906331763477, + "grad_norm": 7.4614916483387335, + "learning_rate": 4.938163861747095e-05, + "logits/chosen": -13.625, + "logits/rejected": -13.875, + "logps/chosen": -488.0, + "logps/rejected": -446.0, + "loss": 0.6317, + "rewards/accuracies": 0.625, + "rewards/chosen": -7.8125, + "rewards/margins": 0.890625, + "rewards/rejected": -8.6875, + "step": 940 + }, + { + "epoch": 0.4971219256933543, + "grad_norm": 10.561175013265123, + "learning_rate": 4.934753651282216e-05, + "logits/chosen": -13.0625, + "logits/rejected": -13.5, + "logps/chosen": -468.0, + "logps/rejected": -454.0, + "loss": 0.7105, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -7.34375, + "rewards/margins": 0.8203125, + "rewards/rejected": -8.1875, + "step": 950 + }, + { + "epoch": 0.5023547880690737, + "grad_norm": 8.854443288097167, + "learning_rate": 4.9312531544097107e-05, + "logits/chosen": -13.875, + "logits/rejected": -14.25, + "logps/chosen": -484.0, + "logps/rejected": -484.0, + "loss": 0.657, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -8.5625, + "rewards/margins": 1.4296875, + "rewards/rejected": -10.0, + "step": 960 + }, + { + "epoch": 0.5075876504447933, + "grad_norm": 14.652422394907061, + "learning_rate": 4.92766250093626e-05, + "logits/chosen": -13.0625, + "logits/rejected": -13.625, + "logps/chosen": -552.0, + "logps/rejected": -488.0, + "loss": 0.7733, + "rewards/accuracies": 0.6875, + "rewards/chosen": -9.1875, + "rewards/margins": 1.4453125, + "rewards/rejected": -10.625, + "step": 970 + }, + { + "epoch": 0.5128205128205128, + "grad_norm": 7.681622448151709, + "learning_rate": 4.923981824011761e-05, + "logits/chosen": -12.0, + "logits/rejected": -12.625, + "logps/chosen": -544.0, + "logps/rejected": -500.0, + "loss": 0.6913, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -8.375, + "rewards/margins": 1.5546875, + "rewards/rejected": -9.875, + "step": 980 + }, + { + "epoch": 0.5180533751962323, + "grad_norm": 9.667593919000552, + "learning_rate": 4.9202112601243956e-05, + "logits/chosen": -12.875, + "logits/rejected": -13.5, + "logps/chosen": -480.0, + "logps/rejected": -440.0, + "loss": 0.7261, + "rewards/accuracies": 0.6875, + "rewards/chosen": -8.375, + "rewards/margins": 0.8671875, + "rewards/rejected": -9.25, + "step": 990 + }, + { + "epoch": 0.5232862375719518, + "grad_norm": 9.808142730894566, + "learning_rate": 4.916350949095566e-05, + "logits/chosen": -14.375, + "logits/rejected": -14.625, + "logps/chosen": -472.0, + "logps/rejected": -452.0, + "loss": 0.7067, + "rewards/accuracies": 0.625, + "rewards/chosen": -8.8125, + "rewards/margins": 0.78125, + "rewards/rejected": -9.625, + "step": 1000 + }, + { + "epoch": 0.5285190999476713, + "grad_norm": 9.81071285242805, + "learning_rate": 4.9124010340747084e-05, + "logits/chosen": -14.25, + "logits/rejected": -14.3125, + "logps/chosen": -512.0, + "logps/rejected": -516.0, + "loss": 0.7869, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -8.4375, + "rewards/margins": 0.8359375, + "rewards/rejected": -9.25, + "step": 1010 + }, + { + "epoch": 0.533751962323391, + "grad_norm": 9.281663389122608, + "learning_rate": 4.908361661533989e-05, + "logits/chosen": -14.1875, + "logits/rejected": -14.5625, + "logps/chosen": -510.0, + "logps/rejected": -474.0, + "loss": 0.7217, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -8.875, + "rewards/margins": 1.4140625, + "rewards/rejected": -10.25, + "step": 1020 + }, + { + "epoch": 0.5389848246991105, + "grad_norm": 6.703363126668062, + "learning_rate": 4.904232981262866e-05, + "logits/chosen": -13.75, + "logits/rejected": -14.0625, + "logps/chosen": -528.0, + "logps/rejected": -464.0, + "loss": 0.7286, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -9.4375, + "rewards/margins": 0.65234375, + "rewards/rejected": -10.0625, + "step": 1030 + }, + { + "epoch": 0.54421768707483, + "grad_norm": 7.751957354860659, + "learning_rate": 4.900015146362544e-05, + "logits/chosen": -12.8125, + "logits/rejected": -12.6875, + "logps/chosen": -478.0, + "logps/rejected": -508.0, + "loss": 0.7434, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -8.5625, + "rewards/margins": 1.0078125, + "rewards/rejected": -9.5625, + "step": 1040 + }, + { + "epoch": 0.5494505494505495, + "grad_norm": 9.518426255540042, + "learning_rate": 4.895708313240286e-05, + "logits/chosen": -12.5, + "logits/rejected": -12.5625, + "logps/chosen": -476.0, + "logps/rejected": -478.0, + "loss": 0.9173, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -8.0, + "rewards/margins": 0.8359375, + "rewards/rejected": -8.875, + "step": 1050 + }, + { + "epoch": 0.554683411826269, + "grad_norm": 9.615827437190525, + "learning_rate": 4.891312641603623e-05, + "logits/chosen": -12.4375, + "logits/rejected": -12.875, + "logps/chosen": -492.0, + "logps/rejected": -460.0, + "loss": 0.7663, + "rewards/accuracies": 0.6875, + "rewards/chosen": -8.25, + "rewards/margins": 0.6953125, + "rewards/rejected": -8.9375, + "step": 1060 + }, + { + "epoch": 0.5599162742019885, + "grad_norm": 7.981258184209591, + "learning_rate": 4.8868282944544266e-05, + "logits/chosen": -11.75, + "logits/rejected": -11.8125, + "logps/chosen": -516.0, + "logps/rejected": -482.0, + "loss": 0.6175, + "rewards/accuracies": 0.6875, + "rewards/chosen": -7.28125, + "rewards/margins": 0.8984375, + "rewards/rejected": -8.1875, + "step": 1070 + }, + { + "epoch": 0.565149136577708, + "grad_norm": 9.025058114682, + "learning_rate": 4.882255438082863e-05, + "logits/chosen": -11.4375, + "logits/rejected": -11.9375, + "logps/chosen": -508.0, + "logps/rejected": -472.0, + "loss": 0.7627, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -9.0, + "rewards/margins": 0.9375, + "rewards/rejected": -9.875, + "step": 1080 + }, + { + "epoch": 0.5703819989534276, + "grad_norm": 15.956761347357418, + "learning_rate": 4.877594242061234e-05, + "logits/chosen": -11.375, + "logits/rejected": -11.875, + "logps/chosen": -540.0, + "logps/rejected": -466.0, + "loss": 0.9185, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -10.0625, + "rewards/margins": 0.76953125, + "rewards/rejected": -10.8125, + "step": 1090 + }, + { + "epoch": 0.5756148613291471, + "grad_norm": 11.243288973766504, + "learning_rate": 4.87284487923768e-05, + "logits/chosen": -11.75, + "logits/rejected": -11.875, + "logps/chosen": -474.0, + "logps/rejected": -450.0, + "loss": 0.7132, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -8.9375, + "rewards/margins": 0.58984375, + "rewards/rejected": -9.5, + "step": 1100 + }, + { + "epoch": 0.5808477237048666, + "grad_norm": 9.173477529825552, + "learning_rate": 4.868007525729775e-05, + "logits/chosen": -12.375, + "logits/rejected": -12.6875, + "logps/chosen": -466.0, + "logps/rejected": -456.0, + "loss": 0.8585, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -8.5, + "rewards/margins": 0.41796875, + "rewards/rejected": -8.9375, + "step": 1110 + }, + { + "epoch": 0.5860805860805861, + "grad_norm": 8.587651387175145, + "learning_rate": 4.8630823609179975e-05, + "logits/chosen": -12.4375, + "logits/rejected": -12.6875, + "logps/chosen": -560.0, + "logps/rejected": -492.0, + "loss": 0.9801, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -9.375, + "rewards/margins": 0.0250244140625, + "rewards/rejected": -9.375, + "step": 1120 + }, + { + "epoch": 0.5913134484563056, + "grad_norm": 10.121602680832632, + "learning_rate": 4.858069567439073e-05, + "logits/chosen": -12.125, + "logits/rejected": -12.1875, + "logps/chosen": -472.0, + "logps/rejected": -486.0, + "loss": 0.8725, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -8.125, + "rewards/margins": 0.7734375, + "rewards/rejected": -8.875, + "step": 1130 + }, + { + "epoch": 0.5965463108320251, + "grad_norm": 6.012474856608194, + "learning_rate": 4.852969331179206e-05, + "logits/chosen": -12.375, + "logits/rejected": -12.9375, + "logps/chosen": -552.0, + "logps/rejected": -500.0, + "loss": 0.7115, + "rewards/accuracies": 0.625, + "rewards/chosen": -9.875, + "rewards/margins": 0.671875, + "rewards/rejected": -10.5625, + "step": 1140 + }, + { + "epoch": 0.6017791732077447, + "grad_norm": 9.660457794042633, + "learning_rate": 4.847781841267186e-05, + "logits/chosen": -12.75, + "logits/rejected": -13.125, + "logps/chosen": -506.0, + "logps/rejected": -496.0, + "loss": 0.6596, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -10.125, + "rewards/margins": 1.3828125, + "rewards/rejected": -11.5, + "step": 1150 + }, + { + "epoch": 0.6070120355834642, + "grad_norm": 7.623993751180003, + "learning_rate": 4.842507290067374e-05, + "logits/chosen": -13.625, + "logits/rejected": -13.8125, + "logps/chosen": -516.0, + "logps/rejected": -450.0, + "loss": 0.7645, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -11.0625, + "rewards/margins": 0.77734375, + "rewards/rejected": -11.875, + "step": 1160 + }, + { + "epoch": 0.6122448979591837, + "grad_norm": 8.389337355599595, + "learning_rate": 4.8371458731725676e-05, + "logits/chosen": -12.6875, + "logits/rejected": -13.0, + "logps/chosen": -532.0, + "logps/rejected": -544.0, + "loss": 0.6616, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -10.25, + "rewards/margins": 0.8125, + "rewards/rejected": -11.125, + "step": 1170 + }, + { + "epoch": 0.6174777603349032, + "grad_norm": 6.931265574049484, + "learning_rate": 4.83169778939675e-05, + "logits/chosen": -12.8125, + "logits/rejected": -13.4375, + "logps/chosen": -528.0, + "logps/rejected": -494.0, + "loss": 0.6737, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -9.9375, + "rewards/margins": 0.8828125, + "rewards/rejected": -10.8125, + "step": 1180 + }, + { + "epoch": 0.6227106227106227, + "grad_norm": 8.050651620193062, + "learning_rate": 4.8261632407677174e-05, + "logits/chosen": -13.5625, + "logits/rejected": -14.0, + "logps/chosen": -512.0, + "logps/rejected": -498.0, + "loss": 0.5961, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -9.875, + "rewards/margins": 0.953125, + "rewards/rejected": -10.875, + "step": 1190 + }, + { + "epoch": 0.6279434850863422, + "grad_norm": 6.392594343316606, + "learning_rate": 4.820542432519583e-05, + "logits/chosen": -13.3125, + "logits/rejected": -13.5, + "logps/chosen": -506.0, + "logps/rejected": -512.0, + "loss": 0.6625, + "rewards/accuracies": 0.75, + "rewards/chosen": -9.5, + "rewards/margins": 1.4140625, + "rewards/rejected": -10.9375, + "step": 1200 + }, + { + "epoch": 0.6331763474620618, + "grad_norm": 8.843935462259351, + "learning_rate": 4.814835573085177e-05, + "logits/chosen": -13.3125, + "logits/rejected": -13.6875, + "logps/chosen": -564.0, + "logps/rejected": -494.0, + "loss": 0.7504, + "rewards/accuracies": 0.6875, + "rewards/chosen": -9.875, + "rewards/margins": 1.2734375, + "rewards/rejected": -11.125, + "step": 1210 + }, + { + "epoch": 0.6384092098377813, + "grad_norm": 10.684751793528747, + "learning_rate": 4.809042874088304e-05, + "logits/chosen": -12.8125, + "logits/rejected": -13.3125, + "logps/chosen": -544.0, + "logps/rejected": -494.0, + "loss": 0.805, + "rewards/accuracies": 0.625, + "rewards/chosen": -9.25, + "rewards/margins": 0.921875, + "rewards/rejected": -10.1875, + "step": 1220 + }, + { + "epoch": 0.6436420722135008, + "grad_norm": 8.122057886743049, + "learning_rate": 4.803164550335906e-05, + "logits/chosen": -12.5, + "logits/rejected": -13.0, + "logps/chosen": -540.0, + "logps/rejected": -496.0, + "loss": 0.7183, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -8.625, + "rewards/margins": 2.03125, + "rewards/rejected": -10.6875, + "step": 1230 + }, + { + "epoch": 0.6488749345892203, + "grad_norm": 8.217900203907949, + "learning_rate": 4.79720081981009e-05, + "logits/chosen": -12.5, + "logits/rejected": -12.5, + "logps/chosen": -450.0, + "logps/rejected": -450.0, + "loss": 0.7273, + "rewards/accuracies": 0.625, + "rewards/chosen": -8.875, + "rewards/margins": 0.859375, + "rewards/rejected": -9.75, + "step": 1240 + }, + { + "epoch": 0.6541077969649398, + "grad_norm": 9.65521490229528, + "learning_rate": 4.79115190366005e-05, + "logits/chosen": -12.5625, + "logits/rejected": -12.6875, + "logps/chosen": -520.0, + "logps/rejected": -510.0, + "loss": 0.7829, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -10.1875, + "rewards/margins": 0.9375, + "rewards/rejected": -11.125, + "step": 1250 + }, + { + "epoch": 0.6593406593406593, + "grad_norm": 6.471532717972355, + "learning_rate": 4.785018026193863e-05, + "logits/chosen": -12.25, + "logits/rejected": -12.6875, + "logps/chosen": -520.0, + "logps/rejected": -468.0, + "loss": 0.6638, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -9.6875, + "rewards/margins": 1.4375, + "rewards/rejected": -11.125, + "step": 1260 + }, + { + "epoch": 0.6645735217163788, + "grad_norm": 8.558298845858685, + "learning_rate": 4.778799414870171e-05, + "logits/chosen": -11.875, + "logits/rejected": -12.3125, + "logps/chosen": -520.0, + "logps/rejected": -490.0, + "loss": 0.8021, + "rewards/accuracies": 0.625, + "rewards/chosen": -9.9375, + "rewards/margins": 0.9453125, + "rewards/rejected": -10.875, + "step": 1270 + }, + { + "epoch": 0.6698063840920984, + "grad_norm": 10.859527406335348, + "learning_rate": 4.772496300289748e-05, + "logits/chosen": -12.0, + "logits/rejected": -12.0, + "logps/chosen": -502.0, + "logps/rejected": -464.0, + "loss": 0.7863, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -9.4375, + "rewards/margins": 0.94140625, + "rewards/rejected": -10.375, + "step": 1280 + }, + { + "epoch": 0.6750392464678179, + "grad_norm": 12.60115094547818, + "learning_rate": 4.76610891618695e-05, + "logits/chosen": -11.75, + "logits/rejected": -11.875, + "logps/chosen": -486.0, + "logps/rejected": -504.0, + "loss": 0.7492, + "rewards/accuracies": 0.5625, + "rewards/chosen": -9.1875, + "rewards/margins": 0.75390625, + "rewards/rejected": -9.9375, + "step": 1290 + }, + { + "epoch": 0.6802721088435374, + "grad_norm": 8.716034881986472, + "learning_rate": 4.7596374994210424e-05, + "logits/chosen": -12.375, + "logits/rejected": -12.3125, + "logps/chosen": -504.0, + "logps/rejected": -520.0, + "loss": 0.7127, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -9.4375, + "rewards/margins": 1.015625, + "rewards/rejected": -10.4375, + "step": 1300 + }, + { + "epoch": 0.6855049712192569, + "grad_norm": 9.364442782095258, + "learning_rate": 4.753082289967421e-05, + "logits/chosen": -12.1875, + "logits/rejected": -12.375, + "logps/chosen": -544.0, + "logps/rejected": -502.0, + "loss": 0.7904, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -9.875, + "rewards/margins": 0.7578125, + "rewards/rejected": -10.625, + "step": 1310 + }, + { + "epoch": 0.6907378335949764, + "grad_norm": 6.165484242150408, + "learning_rate": 4.746443530908714e-05, + "logits/chosen": -12.3125, + "logits/rejected": -12.5, + "logps/chosen": -548.0, + "logps/rejected": -536.0, + "loss": 0.7193, + "rewards/accuracies": 0.6875, + "rewards/chosen": -10.9375, + "rewards/margins": 1.0546875, + "rewards/rejected": -12.0, + "step": 1320 + }, + { + "epoch": 0.6959706959706959, + "grad_norm": 10.533560780263379, + "learning_rate": 4.7397214684257636e-05, + "logits/chosen": -12.0, + "logits/rejected": -11.625, + "logps/chosen": -552.0, + "logps/rejected": -584.0, + "loss": 0.7145, + "rewards/accuracies": 0.6875, + "rewards/chosen": -10.3125, + "rewards/margins": 1.25, + "rewards/rejected": -11.5625, + "step": 1330 + }, + { + "epoch": 0.7012035583464155, + "grad_norm": 8.838646171009014, + "learning_rate": 4.7329163517885e-05, + "logits/chosen": -12.5, + "logits/rejected": -12.6875, + "logps/chosen": -548.0, + "logps/rejected": -510.0, + "loss": 0.7471, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": -10.125, + "rewards/margins": 0.453125, + "rewards/rejected": -10.625, + "step": 1340 + }, + { + "epoch": 0.706436420722135, + "grad_norm": 7.05986771230505, + "learning_rate": 4.726028433346697e-05, + "logits/chosen": -12.75, + "logits/rejected": -12.5, + "logps/chosen": -584.0, + "logps/rejected": -576.0, + "loss": 0.7426, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -11.125, + "rewards/margins": 1.1171875, + "rewards/rejected": -12.25, + "step": 1350 + }, + { + "epoch": 0.7116692830978545, + "grad_norm": 9.672518916674647, + "learning_rate": 4.7190579685206175e-05, + "logits/chosen": -12.5625, + "logits/rejected": -12.75, + "logps/chosen": -620.0, + "logps/rejected": -588.0, + "loss": 0.9889, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -11.9375, + "rewards/margins": 0.875, + "rewards/rejected": -12.8125, + "step": 1360 + }, + { + "epoch": 0.716902145473574, + "grad_norm": 11.391966596496053, + "learning_rate": 4.712005215791535e-05, + "logits/chosen": -11.9375, + "logits/rejected": -12.1875, + "logps/chosen": -564.0, + "logps/rejected": -510.0, + "loss": 0.6305, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -11.25, + "rewards/margins": 0.6015625, + "rewards/rejected": -11.8125, + "step": 1370 + }, + { + "epoch": 0.7221350078492935, + "grad_norm": 6.417146855452024, + "learning_rate": 4.704870436692154e-05, + "logits/chosen": -12.25, + "logits/rejected": -12.6875, + "logps/chosen": -504.0, + "logps/rejected": -486.0, + "loss": 0.6748, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -9.8125, + "rewards/margins": 1.7109375, + "rewards/rejected": -11.5625, + "step": 1380 + }, + { + "epoch": 0.727367870225013, + "grad_norm": 8.005396824784661, + "learning_rate": 4.697653895796912e-05, + "logits/chosen": -12.125, + "logits/rejected": -12.1875, + "logps/chosen": -502.0, + "logps/rejected": -498.0, + "loss": 0.6587, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -10.25, + "rewards/margins": 1.2734375, + "rewards/rejected": -11.5, + "step": 1390 + }, + { + "epoch": 0.7326007326007326, + "grad_norm": 9.511953260913776, + "learning_rate": 4.6903558607121634e-05, + "logits/chosen": -12.5, + "logits/rejected": -12.75, + "logps/chosen": -520.0, + "logps/rejected": -508.0, + "loss": 0.7308, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -10.875, + "rewards/margins": 1.1015625, + "rewards/rejected": -12.0, + "step": 1400 + }, + { + "epoch": 0.7378335949764521, + "grad_norm": 9.796525964563228, + "learning_rate": 4.682976602066263e-05, + "logits/chosen": -12.1875, + "logits/rejected": -12.3125, + "logps/chosen": -516.0, + "logps/rejected": -508.0, + "loss": 0.8031, + "rewards/accuracies": 0.625, + "rewards/chosen": -10.9375, + "rewards/margins": 0.76171875, + "rewards/rejected": -11.6875, + "step": 1410 + }, + { + "epoch": 0.7430664573521716, + "grad_norm": 10.015711312302077, + "learning_rate": 4.6755163934995226e-05, + "logits/chosen": -11.0625, + "logits/rejected": -11.375, + "logps/chosen": -486.0, + "logps/rejected": -468.0, + "loss": 0.8148, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -8.9375, + "rewards/margins": 1.078125, + "rewards/rejected": -10.0, + "step": 1420 + }, + { + "epoch": 0.7482993197278912, + "grad_norm": 6.246681815455791, + "learning_rate": 4.6679755116540726e-05, + "logits/chosen": -11.9375, + "logits/rejected": -12.1875, + "logps/chosen": -510.0, + "logps/rejected": -462.0, + "loss": 0.7297, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -9.5625, + "rewards/margins": 0.2470703125, + "rewards/rejected": -9.875, + "step": 1430 + }, + { + "epoch": 0.7535321821036107, + "grad_norm": 9.65158699681941, + "learning_rate": 4.660354236163596e-05, + "logits/chosen": -12.125, + "logits/rejected": -12.5625, + "logps/chosen": -568.0, + "logps/rejected": -520.0, + "loss": 0.7922, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -10.0625, + "rewards/margins": 0.9921875, + "rewards/rejected": -11.0625, + "step": 1440 + }, + { + "epoch": 0.7587650444793302, + "grad_norm": 8.442087483469699, + "learning_rate": 4.652652849642961e-05, + "logits/chosen": -12.625, + "logits/rejected": -12.875, + "logps/chosen": -540.0, + "logps/rejected": -520.0, + "loss": 0.7822, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -10.9375, + "rewards/margins": 0.66015625, + "rewards/rejected": -11.5625, + "step": 1450 + }, + { + "epoch": 0.7639979068550498, + "grad_norm": 8.897264080440657, + "learning_rate": 4.644871637677746e-05, + "logits/chosen": -13.0, + "logits/rejected": -13.0, + "logps/chosen": -516.0, + "logps/rejected": -472.0, + "loss": 0.7131, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -10.125, + "rewards/margins": 0.57421875, + "rewards/rejected": -10.75, + "step": 1460 + }, + { + "epoch": 0.7692307692307693, + "grad_norm": 6.696938194802173, + "learning_rate": 4.637010888813639e-05, + "logits/chosen": -13.125, + "logits/rejected": -13.3125, + "logps/chosen": -504.0, + "logps/rejected": -480.0, + "loss": 0.6593, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -9.75, + "rewards/margins": 1.0078125, + "rewards/rejected": -10.75, + "step": 1470 + }, + { + "epoch": 0.7744636316064888, + "grad_norm": 8.791582206884076, + "learning_rate": 4.6290708945457494e-05, + "logits/chosen": -13.0625, + "logits/rejected": -13.0, + "logps/chosen": -512.0, + "logps/rejected": -512.0, + "loss": 0.8411, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -11.3125, + "rewards/margins": 0.77734375, + "rewards/rejected": -12.0625, + "step": 1480 + }, + { + "epoch": 0.7796964939822083, + "grad_norm": 10.696923852598411, + "learning_rate": 4.6210519493077895e-05, + "logits/chosen": -12.625, + "logits/rejected": -12.8125, + "logps/chosen": -516.0, + "logps/rejected": -502.0, + "loss": 0.7563, + "rewards/accuracies": 0.625, + "rewards/chosen": -10.3125, + "rewards/margins": 0.80078125, + "rewards/rejected": -11.125, + "step": 1490 + }, + { + "epoch": 0.7849293563579278, + "grad_norm": 7.4467241861253735, + "learning_rate": 4.612954350461161e-05, + "logits/chosen": -13.0625, + "logits/rejected": -12.875, + "logps/chosen": -468.0, + "logps/rejected": -508.0, + "loss": 0.569, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -10.3125, + "rewards/margins": 1.390625, + "rewards/rejected": -11.6875, + "step": 1500 + }, + { + "epoch": 0.7901622187336473, + "grad_norm": 7.183432507030938, + "learning_rate": 4.6047783982839274e-05, + "logits/chosen": -13.5, + "logits/rejected": -13.375, + "logps/chosen": -524.0, + "logps/rejected": -540.0, + "loss": 0.7395, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -11.0625, + "rewards/margins": 0.82421875, + "rewards/rejected": -11.875, + "step": 1510 + }, + { + "epoch": 0.7953950811093669, + "grad_norm": 6.780334689025147, + "learning_rate": 4.5965243959596785e-05, + "logits/chosen": -13.1875, + "logits/rejected": -13.5, + "logps/chosen": -472.0, + "logps/rejected": -464.0, + "loss": 0.6941, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -9.9375, + "rewards/margins": 0.94140625, + "rewards/rejected": -10.875, + "step": 1520 + }, + { + "epoch": 0.8006279434850864, + "grad_norm": 12.754455794530319, + "learning_rate": 4.5881926495662854e-05, + "logits/chosen": -12.875, + "logits/rejected": -12.5625, + "logps/chosen": -572.0, + "logps/rejected": -648.0, + "loss": 0.7111, + "rewards/accuracies": 0.5625, + "rewards/chosen": -11.5, + "rewards/margins": 0.8828125, + "rewards/rejected": -12.375, + "step": 1530 + }, + { + "epoch": 0.8058608058608059, + "grad_norm": 7.504670748944528, + "learning_rate": 4.579783468064556e-05, + "logits/chosen": -12.5, + "logits/rejected": -12.625, + "logps/chosen": -580.0, + "logps/rejected": -564.0, + "loss": 0.7643, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -11.0, + "rewards/margins": 0.62109375, + "rewards/rejected": -11.625, + "step": 1540 + }, + { + "epoch": 0.8110936682365254, + "grad_norm": 9.695054676896365, + "learning_rate": 4.5712971632867715e-05, + "logits/chosen": -12.75, + "logits/rejected": -13.0625, + "logps/chosen": -568.0, + "logps/rejected": -500.0, + "loss": 0.7573, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -9.625, + "rewards/margins": 1.1640625, + "rewards/rejected": -10.75, + "step": 1550 + }, + { + "epoch": 0.8163265306122449, + "grad_norm": 7.970194532217434, + "learning_rate": 4.5627340499251294e-05, + "logits/chosen": -12.1875, + "logits/rejected": -12.5625, + "logps/chosen": -572.0, + "logps/rejected": -548.0, + "loss": 0.7577, + "rewards/accuracies": 0.5625, + "rewards/chosen": -10.8125, + "rewards/margins": 0.66015625, + "rewards/rejected": -11.5, + "step": 1560 + }, + { + "epoch": 0.8215593929879644, + "grad_norm": 7.8570308161394085, + "learning_rate": 4.5540944455200666e-05, + "logits/chosen": -13.0, + "logits/rejected": -13.0, + "logps/chosen": -510.0, + "logps/rejected": -502.0, + "loss": 0.7306, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -10.9375, + "rewards/margins": 1.0859375, + "rewards/rejected": -12.0, + "step": 1570 + }, + { + "epoch": 0.826792255363684, + "grad_norm": 8.679285528741042, + "learning_rate": 4.545378670448492e-05, + "logits/chosen": -12.5625, + "logits/rejected": -12.9375, + "logps/chosen": -592.0, + "logps/rejected": -528.0, + "loss": 0.7772, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -10.75, + "rewards/margins": 0.94921875, + "rewards/rejected": -11.6875, + "step": 1580 + }, + { + "epoch": 0.8320251177394035, + "grad_norm": 7.294489262151519, + "learning_rate": 4.536587047911901e-05, + "logits/chosen": -11.9375, + "logits/rejected": -12.1875, + "logps/chosen": -528.0, + "logps/rejected": -496.0, + "loss": 0.6925, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -9.875, + "rewards/margins": 1.109375, + "rewards/rejected": -11.0, + "step": 1590 + }, + { + "epoch": 0.837257980115123, + "grad_norm": 10.134656229651304, + "learning_rate": 4.527719903924392e-05, + "logits/chosen": -11.5625, + "logits/rejected": -11.5, + "logps/chosen": -508.0, + "logps/rejected": -544.0, + "loss": 0.7312, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -10.375, + "rewards/margins": 0.81640625, + "rewards/rejected": -11.125, + "step": 1600 + }, + { + "epoch": 0.8424908424908425, + "grad_norm": 8.860873669647612, + "learning_rate": 4.518777567300575e-05, + "logits/chosen": -11.125, + "logits/rejected": -11.625, + "logps/chosen": -596.0, + "logps/rejected": -552.0, + "loss": 0.7317, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -10.8125, + "rewards/margins": 1.2578125, + "rewards/rejected": -12.0625, + "step": 1610 + }, + { + "epoch": 0.847723704866562, + "grad_norm": 6.680364487046663, + "learning_rate": 4.5097603696433845e-05, + "logits/chosen": -11.875, + "logits/rejected": -12.25, + "logps/chosen": -536.0, + "logps/rejected": -490.0, + "loss": 0.8224, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": -11.625, + "rewards/margins": 0.53125, + "rewards/rejected": -12.1875, + "step": 1620 + }, + { + "epoch": 0.8529565672422815, + "grad_norm": 8.12581157579285, + "learning_rate": 4.5006686453317734e-05, + "logits/chosen": -12.4375, + "logits/rejected": -12.5625, + "logps/chosen": -556.0, + "logps/rejected": -564.0, + "loss": 0.8097, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -11.0625, + "rewards/margins": 1.015625, + "rewards/rejected": -12.0625, + "step": 1630 + }, + { + "epoch": 0.858189429618001, + "grad_norm": 8.268449988760125, + "learning_rate": 4.4915027315083246e-05, + "logits/chosen": -12.0, + "logits/rejected": -12.375, + "logps/chosen": -588.0, + "logps/rejected": -548.0, + "loss": 0.7346, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -10.375, + "rewards/margins": 0.9453125, + "rewards/rejected": -11.3125, + "step": 1640 + }, + { + "epoch": 0.8634222919937206, + "grad_norm": 9.632043499380252, + "learning_rate": 4.4822629680667375e-05, + "logits/chosen": -12.375, + "logits/rejected": -12.3125, + "logps/chosen": -524.0, + "logps/rejected": -528.0, + "loss": 0.7777, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -10.25, + "rewards/margins": 0.9140625, + "rewards/rejected": -11.1875, + "step": 1650 + }, + { + "epoch": 0.8686551543694401, + "grad_norm": 6.63480458290659, + "learning_rate": 4.472949697639233e-05, + "logits/chosen": -13.0, + "logits/rejected": -13.0, + "logps/chosen": -480.0, + "logps/rejected": -478.0, + "loss": 0.7749, + "rewards/accuracies": 0.6875, + "rewards/chosen": -9.6875, + "rewards/margins": 1.078125, + "rewards/rejected": -10.75, + "step": 1660 + }, + { + "epoch": 0.8738880167451596, + "grad_norm": 8.946660909641642, + "learning_rate": 4.463563265583843e-05, + "logits/chosen": -12.6875, + "logits/rejected": -13.0625, + "logps/chosen": -556.0, + "logps/rejected": -532.0, + "loss": 0.7023, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -10.5, + "rewards/margins": 1.3359375, + "rewards/rejected": -11.875, + "step": 1670 + }, + { + "epoch": 0.8791208791208791, + "grad_norm": 8.79339110053614, + "learning_rate": 4.4541040199716066e-05, + "logits/chosen": -13.3125, + "logits/rejected": -13.5, + "logps/chosen": -508.0, + "logps/rejected": -532.0, + "loss": 0.6736, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -10.125, + "rewards/margins": 1.4296875, + "rewards/rejected": -11.5625, + "step": 1680 + }, + { + "epoch": 0.8843537414965986, + "grad_norm": 7.324747045901078, + "learning_rate": 4.444572311573659e-05, + "logits/chosen": -13.3125, + "logits/rejected": -13.375, + "logps/chosen": -536.0, + "logps/rejected": -506.0, + "loss": 0.6513, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -11.125, + "rewards/margins": 0.73046875, + "rewards/rejected": -11.875, + "step": 1690 + }, + { + "epoch": 0.8895866038723181, + "grad_norm": 9.433520118340347, + "learning_rate": 4.4349684938482286e-05, + "logits/chosen": -13.125, + "logits/rejected": -13.6875, + "logps/chosen": -564.0, + "logps/rejected": -498.0, + "loss": 0.5843, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": -10.8125, + "rewards/margins": 1.5546875, + "rewards/rejected": -12.375, + "step": 1700 + }, + { + "epoch": 0.8948194662480377, + "grad_norm": 7.781493747070478, + "learning_rate": 4.4252929229275255e-05, + "logits/chosen": -12.8125, + "logits/rejected": -13.1875, + "logps/chosen": -576.0, + "logps/rejected": -556.0, + "loss": 0.7485, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -11.6875, + "rewards/margins": 1.1328125, + "rewards/rejected": -12.875, + "step": 1710 + }, + { + "epoch": 0.9000523286237572, + "grad_norm": 7.028647219407263, + "learning_rate": 4.41554595760454e-05, + "logits/chosen": -13.0, + "logits/rejected": -13.375, + "logps/chosen": -568.0, + "logps/rejected": -536.0, + "loss": 0.6974, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -11.9375, + "rewards/margins": 0.953125, + "rewards/rejected": -12.9375, + "step": 1720 + }, + { + "epoch": 0.9052851909994767, + "grad_norm": 14.387364821525113, + "learning_rate": 4.405727959319733e-05, + "logits/chosen": -12.875, + "logits/rejected": -13.0, + "logps/chosen": -528.0, + "logps/rejected": -536.0, + "loss": 0.7899, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -11.0625, + "rewards/margins": 1.5, + "rewards/rejected": -12.5625, + "step": 1730 + }, + { + "epoch": 0.9105180533751962, + "grad_norm": 8.271769950792597, + "learning_rate": 4.3958392921476376e-05, + "logits/chosen": -12.625, + "logits/rejected": -13.125, + "logps/chosen": -576.0, + "logps/rejected": -532.0, + "loss": 0.8312, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -11.8125, + "rewards/margins": 0.8359375, + "rewards/rejected": -12.625, + "step": 1740 + }, + { + "epoch": 0.9157509157509157, + "grad_norm": 8.418576821960974, + "learning_rate": 4.385880322783353e-05, + "logits/chosen": -12.9375, + "logits/rejected": -13.0625, + "logps/chosen": -604.0, + "logps/rejected": -592.0, + "loss": 0.7557, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -11.5, + "rewards/margins": 1.2578125, + "rewards/rejected": -12.75, + "step": 1750 + }, + { + "epoch": 0.9209837781266352, + "grad_norm": 12.101413536769304, + "learning_rate": 4.375851420528952e-05, + "logits/chosen": -12.75, + "logits/rejected": -12.8125, + "logps/chosen": -552.0, + "logps/rejected": -584.0, + "loss": 0.7447, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -11.1875, + "rewards/margins": 0.95703125, + "rewards/rejected": -12.125, + "step": 1760 + }, + { + "epoch": 0.9262166405023547, + "grad_norm": 6.9704706163859225, + "learning_rate": 4.3657529572797804e-05, + "logits/chosen": -12.125, + "logits/rejected": -12.4375, + "logps/chosen": -576.0, + "logps/rejected": -568.0, + "loss": 0.8642, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -11.5, + "rewards/margins": 0.640625, + "rewards/rejected": -12.1875, + "step": 1770 + }, + { + "epoch": 0.9314495028780743, + "grad_norm": 7.989756027543109, + "learning_rate": 4.355585307510675e-05, + "logits/chosen": -12.8125, + "logits/rejected": -12.875, + "logps/chosen": -568.0, + "logps/rejected": -568.0, + "loss": 0.7395, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -11.125, + "rewards/margins": 1.6796875, + "rewards/rejected": -12.8125, + "step": 1780 + }, + { + "epoch": 0.9366823652537938, + "grad_norm": 7.4392668707427525, + "learning_rate": 4.345348848262068e-05, + "logits/chosen": -12.125, + "logits/rejected": -12.375, + "logps/chosen": -588.0, + "logps/rejected": -584.0, + "loss": 0.7134, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -11.125, + "rewards/margins": 1.2109375, + "rewards/rejected": -12.375, + "step": 1790 + }, + { + "epoch": 0.9419152276295133, + "grad_norm": 8.83048121979174, + "learning_rate": 4.3350439591260105e-05, + "logits/chosen": -12.8125, + "logits/rejected": -12.8125, + "logps/chosen": -536.0, + "logps/rejected": -572.0, + "loss": 0.8313, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -11.75, + "rewards/margins": 1.125, + "rewards/rejected": -12.875, + "step": 1800 + }, + { + "epoch": 0.9471480900052328, + "grad_norm": 7.162765830891214, + "learning_rate": 4.3246710222320956e-05, + "logits/chosen": -12.8125, + "logits/rejected": -12.75, + "logps/chosen": -486.0, + "logps/rejected": -492.0, + "loss": 0.7147, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": -10.9375, + "rewards/margins": 1.2578125, + "rewards/rejected": -12.1875, + "step": 1810 + }, + { + "epoch": 0.9523809523809523, + "grad_norm": 7.105352599980392, + "learning_rate": 4.314230422233286e-05, + "logits/chosen": -12.5, + "logits/rejected": -12.5, + "logps/chosen": -450.0, + "logps/rejected": -448.0, + "loss": 0.7855, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": -9.9375, + "rewards/margins": 0.65625, + "rewards/rejected": -10.625, + "step": 1820 + }, + { + "epoch": 0.957613814756672, + "grad_norm": 10.300313199279667, + "learning_rate": 4.303722546291656e-05, + "logits/chosen": -12.3125, + "logits/rejected": -12.625, + "logps/chosen": -544.0, + "logps/rejected": -510.0, + "loss": 0.7179, + "rewards/accuracies": 0.6875, + "rewards/chosen": -11.125, + "rewards/margins": 0.8203125, + "rewards/rejected": -11.9375, + "step": 1830 + }, + { + "epoch": 0.9628466771323915, + "grad_norm": 7.351746246119899, + "learning_rate": 4.293147784064025e-05, + "logits/chosen": -12.8125, + "logits/rejected": -13.1875, + "logps/chosen": -568.0, + "logps/rejected": -544.0, + "loss": 0.6308, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -11.5, + "rewards/margins": 1.28125, + "rewards/rejected": -12.75, + "step": 1840 + }, + { + "epoch": 0.968079539508111, + "grad_norm": 9.715474841922566, + "learning_rate": 4.282506527687518e-05, + "logits/chosen": -12.1875, + "logits/rejected": -12.5625, + "logps/chosen": -640.0, + "logps/rejected": -576.0, + "loss": 0.7679, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -11.9375, + "rewards/margins": 1.0390625, + "rewards/rejected": -13.0, + "step": 1850 + }, + { + "epoch": 0.9733124018838305, + "grad_norm": 7.494793387712396, + "learning_rate": 4.2717991717650164e-05, + "logits/chosen": -12.1875, + "logits/rejected": -12.125, + "logps/chosen": -560.0, + "logps/rejected": -520.0, + "loss": 0.8072, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -11.6875, + "rewards/margins": 0.61328125, + "rewards/rejected": -12.25, + "step": 1860 + }, + { + "epoch": 0.97854526425955, + "grad_norm": 9.106930582299938, + "learning_rate": 4.261026113350532e-05, + "logits/chosen": -12.1875, + "logits/rejected": -12.5625, + "logps/chosen": -516.0, + "logps/rejected": -472.0, + "loss": 0.7537, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -10.5, + "rewards/margins": 0.53125, + "rewards/rejected": -11.0, + "step": 1870 + }, + { + "epoch": 0.9837781266352695, + "grad_norm": 6.646188139442894, + "learning_rate": 4.25018775193448e-05, + "logits/chosen": -12.125, + "logits/rejected": -11.9375, + "logps/chosen": -540.0, + "logps/rejected": -556.0, + "loss": 0.7246, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": -11.1875, + "rewards/margins": 0.875, + "rewards/rejected": -12.125, + "step": 1880 + }, + { + "epoch": 0.989010989010989, + "grad_norm": 10.495153682929477, + "learning_rate": 4.239284489428861e-05, + "logits/chosen": -12.25, + "logits/rejected": -12.4375, + "logps/chosen": -636.0, + "logps/rejected": -620.0, + "loss": 0.6875, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": -11.75, + "rewards/margins": 0.9453125, + "rewards/rejected": -12.6875, + "step": 1890 + }, + { + "epoch": 0.9942438513867086, + "grad_norm": 8.896250743023822, + "learning_rate": 4.2283167301523636e-05, + "logits/chosen": -12.125, + "logits/rejected": -12.0625, + "logps/chosen": -540.0, + "logps/rejected": -540.0, + "loss": 0.7295, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": -11.625, + "rewards/margins": 0.8515625, + "rewards/rejected": -12.4375, + "step": 1900 + }, + { + "epoch": 0.9994767137624281, + "grad_norm": 9.076036052295438, + "learning_rate": 4.217284880815369e-05, + "logits/chosen": -12.3125, + "logits/rejected": -12.3125, + "logps/chosen": -608.0, + "logps/rejected": -560.0, + "loss": 0.7494, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": -11.875, + "rewards/margins": 0.6796875, + "rewards/rejected": -12.5625, + "step": 1910 + }, + { + "epoch": 1.0, + "eval_logits/chosen": -12.5625, + "eval_logits/rejected": -12.8125, + "eval_logps/chosen": -552.0, + "eval_logps/rejected": -548.0, + "eval_loss": 0.7878593802452087, + "eval_rewards/accuracies": 0.6796875, + "eval_rewards/chosen": -11.6875, + "eval_rewards/margins": 1.1796875, + "eval_rewards/rejected": -12.8125, + "eval_runtime": 47.5543, + "eval_samples_per_second": 42.057, + "eval_steps_per_second": 0.673, + "step": 1911 + }, + { + "epoch": 1.0047095761381475, + "grad_norm": 5.025796276275169, + "learning_rate": 4.20618935050487e-05, + "logits/chosen": -12.6875, + "logits/rejected": -12.875, + "logps/chosen": -482.0, + "logps/rejected": -528.0, + "loss": 0.3374, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.125, + "rewards/margins": 3.65625, + "rewards/rejected": -13.75, + "step": 1920 + }, + { + "epoch": 1.0099424385138671, + "grad_norm": 2.617047145095191, + "learning_rate": 4.195030550669297e-05, + "logits/chosen": -12.5, + "logits/rejected": -12.875, + "logps/chosen": -576.0, + "logps/rejected": -624.0, + "loss": 0.1885, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -9.3125, + "rewards/margins": 6.96875, + "rewards/rejected": -16.25, + "step": 1930 + }, + { + "epoch": 1.0151753008895865, + "grad_norm": 3.8017233289248216, + "learning_rate": 4.1838088951032665e-05, + "logits/chosen": -12.5, + "logits/rejected": -12.4375, + "logps/chosen": -528.0, + "logps/rejected": -660.0, + "loss": 0.1563, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -9.5, + "rewards/margins": 7.28125, + "rewards/rejected": -16.75, + "step": 1940 + }, + { + "epoch": 1.0204081632653061, + "grad_norm": 2.1972464246724654, + "learning_rate": 4.1725247999322316e-05, + "logits/chosen": -12.3125, + "logits/rejected": -12.125, + "logps/chosen": -502.0, + "logps/rejected": -588.0, + "loss": 0.1881, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -10.6875, + "rewards/margins": 5.1875, + "rewards/rejected": -15.875, + "step": 1950 + }, + { + "epoch": 1.0256410256410255, + "grad_norm": 2.912719312978293, + "learning_rate": 4.161178683597054e-05, + "logits/chosen": -12.125, + "logits/rejected": -11.6875, + "logps/chosen": -528.0, + "logps/rejected": -568.0, + "loss": 0.1273, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -10.375, + "rewards/margins": 5.90625, + "rewards/rejected": -16.25, + "step": 1960 + }, + { + "epoch": 1.0308738880167452, + "grad_norm": 2.7302757245440983, + "learning_rate": 4.149770966838489e-05, + "logits/chosen": -12.4375, + "logits/rejected": -12.375, + "logps/chosen": -580.0, + "logps/rejected": -680.0, + "loss": 0.1697, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -9.1875, + "rewards/margins": 9.1875, + "rewards/rejected": -18.375, + "step": 1970 + }, + { + "epoch": 1.0361067503924646, + "grad_norm": 2.7591867060534776, + "learning_rate": 4.1383020726815744e-05, + "logits/chosen": -12.4375, + "logits/rejected": -12.5, + "logps/chosen": -484.0, + "logps/rejected": -572.0, + "loss": 0.13, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -9.8125, + "rewards/margins": 5.84375, + "rewards/rejected": -15.6875, + "step": 1980 + }, + { + "epoch": 1.0413396127681842, + "grad_norm": 3.8734929943590486, + "learning_rate": 4.1267724264199595e-05, + "logits/chosen": -13.1875, + "logits/rejected": -13.125, + "logps/chosen": -560.0, + "logps/rejected": -672.0, + "loss": 0.1567, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -9.375, + "rewards/margins": 7.40625, + "rewards/rejected": -16.75, + "step": 1990 + }, + { + "epoch": 1.0465724751439036, + "grad_norm": 2.5264805572715905, + "learning_rate": 4.115182455600115e-05, + "logits/chosen": -13.125, + "logits/rejected": -13.4375, + "logps/chosen": -532.0, + "logps/rejected": -592.0, + "loss": 0.2246, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -9.9375, + "rewards/margins": 5.71875, + "rewards/rejected": -15.6875, + "step": 2000 + }, + { + "epoch": 1.0518053375196232, + "grad_norm": 1.4136788093992387, + "learning_rate": 4.103532590005496e-05, + "logits/chosen": -13.375, + "logits/rejected": -13.4375, + "logps/chosen": -524.0, + "logps/rejected": -620.0, + "loss": 0.1506, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -10.5, + "rewards/margins": 7.3125, + "rewards/rejected": -17.75, + "step": 2010 + }, + { + "epoch": 1.0570381998953426, + "grad_norm": 5.8472125628304905, + "learning_rate": 4.0918232616405925e-05, + "logits/chosen": -13.375, + "logits/rejected": -13.4375, + "logps/chosen": -540.0, + "logps/rejected": -604.0, + "loss": 0.1525, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -10.5625, + "rewards/margins": 6.84375, + "rewards/rejected": -17.375, + "step": 2020 + }, + { + "epoch": 1.0622710622710623, + "grad_norm": 3.7593254661033937, + "learning_rate": 4.080054904714917e-05, + "logits/chosen": -13.75, + "logits/rejected": -13.4375, + "logps/chosen": -544.0, + "logps/rejected": -680.0, + "loss": 0.168, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -11.3125, + "rewards/margins": 7.53125, + "rewards/rejected": -18.875, + "step": 2030 + }, + { + "epoch": 1.0675039246467817, + "grad_norm": 2.4096345005426985, + "learning_rate": 4.0682279556269e-05, + "logits/chosen": -13.75, + "logits/rejected": -13.75, + "logps/chosen": -612.0, + "logps/rejected": -704.0, + "loss": 0.155, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -11.0625, + "rewards/margins": 7.5625, + "rewards/rejected": -18.625, + "step": 2040 + }, + { + "epoch": 1.0727367870225013, + "grad_norm": 3.736602458261602, + "learning_rate": 4.056342852947706e-05, + "logits/chosen": -13.4375, + "logits/rejected": -13.75, + "logps/chosen": -564.0, + "logps/rejected": -640.0, + "loss": 0.1078, + "rewards/accuracies": 0.9375, + "rewards/chosen": -10.0625, + "rewards/margins": 7.625, + "rewards/rejected": -17.75, + "step": 2050 + }, + { + "epoch": 1.077969649398221, + "grad_norm": 5.699200852229238, + "learning_rate": 4.044400037404974e-05, + "logits/chosen": -14.125, + "logits/rejected": -13.75, + "logps/chosen": -488.0, + "logps/rejected": -616.0, + "loss": 0.165, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -9.5625, + "rewards/margins": 7.0625, + "rewards/rejected": -16.625, + "step": 2060 + }, + { + "epoch": 1.0832025117739403, + "grad_norm": 9.084933877985728, + "learning_rate": 4.032399951866469e-05, + "logits/chosen": -13.5, + "logits/rejected": -13.5, + "logps/chosen": -456.0, + "logps/rejected": -528.0, + "loss": 0.1929, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -9.375, + "rewards/margins": 5.46875, + "rewards/rejected": -14.875, + "step": 2070 + }, + { + "epoch": 1.08843537414966, + "grad_norm": 2.1048784655877046, + "learning_rate": 4.020343041323664e-05, + "logits/chosen": -12.5, + "logits/rejected": -12.75, + "logps/chosen": -540.0, + "logps/rejected": -632.0, + "loss": 0.2385, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -9.5625, + "rewards/margins": 6.9375, + "rewards/rejected": -16.5, + "step": 2080 + }, + { + "epoch": 1.0936682365253794, + "grad_norm": 1.3259403661494187, + "learning_rate": 4.008229752875241e-05, + "logits/chosen": -12.0625, + "logits/rejected": -12.125, + "logps/chosen": -470.0, + "logps/rejected": -588.0, + "loss": 0.134, + "rewards/accuracies": 0.9375, + "rewards/chosen": -9.375, + "rewards/margins": 6.71875, + "rewards/rejected": -16.125, + "step": 2090 + }, + { + "epoch": 1.098901098901099, + "grad_norm": 5.124076145163906, + "learning_rate": 3.996060535710501e-05, + "logits/chosen": -11.6875, + "logits/rejected": -11.875, + "logps/chosen": -524.0, + "logps/rejected": -612.0, + "loss": 0.1352, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -9.6875, + "rewards/margins": 7.4375, + "rewards/rejected": -17.125, + "step": 2100 + }, + { + "epoch": 1.1041339612768184, + "grad_norm": 1.8708820089615186, + "learning_rate": 3.9838358410927165e-05, + "logits/chosen": -11.625, + "logits/rejected": -12.0, + "logps/chosen": -560.0, + "logps/rejected": -620.0, + "loss": 0.1389, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -10.4375, + "rewards/margins": 6.75, + "rewards/rejected": -17.125, + "step": 2110 + }, + { + "epoch": 1.109366823652538, + "grad_norm": 8.468865024177259, + "learning_rate": 3.9715561223423984e-05, + "logits/chosen": -12.0625, + "logits/rejected": -12.25, + "logps/chosen": -512.0, + "logps/rejected": -636.0, + "loss": 0.1779, + "rewards/accuracies": 0.9375, + "rewards/chosen": -10.8125, + "rewards/margins": 7.1875, + "rewards/rejected": -18.0, + "step": 2120 + }, + { + "epoch": 1.1145996860282574, + "grad_norm": 4.553312461630311, + "learning_rate": 3.959221834820477e-05, + "logits/chosen": -12.0, + "logits/rejected": -12.375, + "logps/chosen": -548.0, + "logps/rejected": -572.0, + "loss": 0.2247, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -10.4375, + "rewards/margins": 5.625, + "rewards/rejected": -16.125, + "step": 2130 + }, + { + "epoch": 1.119832548403977, + "grad_norm": 4.725944299007761, + "learning_rate": 3.946833435911424e-05, + "logits/chosen": -12.875, + "logits/rejected": -12.875, + "logps/chosen": -544.0, + "logps/rejected": -680.0, + "loss": 0.155, + "rewards/accuracies": 0.9375, + "rewards/chosen": -11.5, + "rewards/margins": 7.5, + "rewards/rejected": -19.0, + "step": 2140 + }, + { + "epoch": 1.1250654107796965, + "grad_norm": 5.280153035514204, + "learning_rate": 3.9343913850062855e-05, + "logits/chosen": -13.3125, + "logits/rejected": -12.9375, + "logps/chosen": -496.0, + "logps/rejected": -668.0, + "loss": 0.1808, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -11.0625, + "rewards/margins": 7.25, + "rewards/rejected": -18.375, + "step": 2150 + }, + { + "epoch": 1.130298273155416, + "grad_norm": 2.203699427092351, + "learning_rate": 3.921896143485657e-05, + "logits/chosen": -13.0, + "logits/rejected": -13.25, + "logps/chosen": -556.0, + "logps/rejected": -624.0, + "loss": 0.1895, + "rewards/accuracies": 0.9375, + "rewards/chosen": -11.1875, + "rewards/margins": 6.375, + "rewards/rejected": -17.5, + "step": 2160 + }, + { + "epoch": 1.1355311355311355, + "grad_norm": 5.252548484606017, + "learning_rate": 3.909348174702562e-05, + "logits/chosen": -13.375, + "logits/rejected": -13.4375, + "logps/chosen": -544.0, + "logps/rejected": -636.0, + "loss": 0.1515, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -10.0625, + "rewards/margins": 6.9375, + "rewards/rejected": -17.0, + "step": 2170 + }, + { + "epoch": 1.1407639979068551, + "grad_norm": 3.319289830814653, + "learning_rate": 3.8967479439652755e-05, + "logits/chosen": -13.125, + "logits/rejected": -13.0, + "logps/chosen": -504.0, + "logps/rejected": -648.0, + "loss": 0.1031, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -10.125, + "rewards/margins": 7.6875, + "rewards/rejected": -17.75, + "step": 2180 + }, + { + "epoch": 1.1459968602825745, + "grad_norm": 2.762256013302049, + "learning_rate": 3.884095918520072e-05, + "logits/chosen": -13.125, + "logits/rejected": -13.0625, + "logps/chosen": -516.0, + "logps/rejected": -632.0, + "loss": 0.1179, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -10.125, + "rewards/margins": 6.6875, + "rewards/rejected": -16.875, + "step": 2190 + }, + { + "epoch": 1.1512297226582942, + "grad_norm": 11.413493442922952, + "learning_rate": 3.871392567533893e-05, + "logits/chosen": -12.75, + "logits/rejected": -12.875, + "logps/chosen": -556.0, + "logps/rejected": -648.0, + "loss": 0.1835, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -9.8125, + "rewards/margins": 7.8125, + "rewards/rejected": -17.625, + "step": 2200 + }, + { + "epoch": 1.1564625850340136, + "grad_norm": 4.885527079207722, + "learning_rate": 3.8586383620769536e-05, + "logits/chosen": -12.8125, + "logits/rejected": -12.9375, + "logps/chosen": -548.0, + "logps/rejected": -620.0, + "loss": 0.1468, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -10.75, + "rewards/margins": 6.9375, + "rewards/rejected": -17.75, + "step": 2210 + }, + { + "epoch": 1.1616954474097332, + "grad_norm": 4.344316843378256, + "learning_rate": 3.845833775105272e-05, + "logits/chosen": -13.125, + "logits/rejected": -13.125, + "logps/chosen": -524.0, + "logps/rejected": -672.0, + "loss": 0.1325, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -10.875, + "rewards/margins": 8.4375, + "rewards/rejected": -19.25, + "step": 2220 + }, + { + "epoch": 1.1669283097854526, + "grad_norm": 8.466061149421654, + "learning_rate": 3.832979281443133e-05, + "logits/chosen": -13.0, + "logits/rejected": -13.0, + "logps/chosen": -596.0, + "logps/rejected": -676.0, + "loss": 0.1534, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -11.0, + "rewards/margins": 7.25, + "rewards/rejected": -18.25, + "step": 2230 + }, + { + "epoch": 1.1721611721611722, + "grad_norm": 8.33172408761989, + "learning_rate": 3.8200753577654766e-05, + "logits/chosen": -13.5, + "logits/rejected": -13.4375, + "logps/chosen": -504.0, + "logps/rejected": -648.0, + "loss": 0.1957, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -10.75, + "rewards/margins": 7.65625, + "rewards/rejected": -18.375, + "step": 2240 + }, + { + "epoch": 1.1773940345368916, + "grad_norm": 2.8067450193311405, + "learning_rate": 3.807122482580228e-05, + "logits/chosen": -13.0, + "logits/rejected": -13.1875, + "logps/chosen": -540.0, + "logps/rejected": -644.0, + "loss": 0.1298, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -10.0, + "rewards/margins": 6.40625, + "rewards/rejected": -16.375, + "step": 2250 + }, + { + "epoch": 1.1826268969126112, + "grad_norm": 2.6171778099575698, + "learning_rate": 3.794121136210546e-05, + "logits/chosen": -13.5625, + "logits/rejected": -13.5625, + "logps/chosen": -544.0, + "logps/rejected": -660.0, + "loss": 0.1174, + "rewards/accuracies": 0.9375, + "rewards/chosen": -10.375, + "rewards/margins": 7.40625, + "rewards/rejected": -17.75, + "step": 2260 + }, + { + "epoch": 1.1878597592883307, + "grad_norm": 4.4393184562284045, + "learning_rate": 3.7810718007770175e-05, + "logits/chosen": -13.8125, + "logits/rejected": -13.8125, + "logps/chosen": -580.0, + "logps/rejected": -712.0, + "loss": 0.1337, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -10.1875, + "rewards/margins": 8.625, + "rewards/rejected": -18.875, + "step": 2270 + }, + { + "epoch": 1.1930926216640503, + "grad_norm": 2.8975735844666466, + "learning_rate": 3.7679749601797765e-05, + "logits/chosen": -13.6875, + "logits/rejected": -13.75, + "logps/chosen": -544.0, + "logps/rejected": -648.0, + "loss": 0.1665, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -9.875, + "rewards/margins": 8.0625, + "rewards/rejected": -17.875, + "step": 2280 + }, + { + "epoch": 1.1983254840397697, + "grad_norm": 4.705036027569537, + "learning_rate": 3.754831100080561e-05, + "logits/chosen": -14.125, + "logits/rejected": -14.0, + "logps/chosen": -500.0, + "logps/rejected": -660.0, + "loss": 0.1207, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -9.5, + "rewards/margins": 8.0625, + "rewards/rejected": -17.625, + "step": 2290 + }, + { + "epoch": 1.2035583464154893, + "grad_norm": 3.3554354031784976, + "learning_rate": 3.741640707884702e-05, + "logits/chosen": -13.625, + "logits/rejected": -13.75, + "logps/chosen": -516.0, + "logps/rejected": -640.0, + "loss": 0.1299, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -9.125, + "rewards/margins": 7.40625, + "rewards/rejected": -16.5, + "step": 2300 + }, + { + "epoch": 1.2087912087912087, + "grad_norm": 4.380253542568911, + "learning_rate": 3.728404272723051e-05, + "logits/chosen": -13.75, + "logits/rejected": -13.875, + "logps/chosen": -492.0, + "logps/rejected": -596.0, + "loss": 0.1698, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -9.8125, + "rewards/margins": 6.96875, + "rewards/rejected": -16.75, + "step": 2310 + }, + { + "epoch": 1.2140240711669283, + "grad_norm": 5.863469122548758, + "learning_rate": 3.715122285433842e-05, + "logits/chosen": -13.5625, + "logits/rejected": -13.875, + "logps/chosen": -560.0, + "logps/rejected": -632.0, + "loss": 0.1684, + "rewards/accuracies": 0.9375, + "rewards/chosen": -11.0, + "rewards/margins": 6.5625, + "rewards/rejected": -17.5, + "step": 2320 + }, + { + "epoch": 1.2192569335426477, + "grad_norm": 3.85026586750782, + "learning_rate": 3.701795238544488e-05, + "logits/chosen": -13.3125, + "logits/rejected": -13.25, + "logps/chosen": -564.0, + "logps/rejected": -644.0, + "loss": 0.2142, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -11.0, + "rewards/margins": 6.75, + "rewards/rejected": -17.75, + "step": 2330 + }, + { + "epoch": 1.2244897959183674, + "grad_norm": 2.8862551859168333, + "learning_rate": 3.6884236262533187e-05, + "logits/chosen": -13.1875, + "logits/rejected": -12.875, + "logps/chosen": -508.0, + "logps/rejected": -608.0, + "loss": 0.1627, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -11.1875, + "rewards/margins": 6.53125, + "rewards/rejected": -17.625, + "step": 2340 + }, + { + "epoch": 1.2297226582940868, + "grad_norm": 1.7267058938947848, + "learning_rate": 3.6750079444112535e-05, + "logits/chosen": -13.5625, + "logits/rejected": -13.25, + "logps/chosen": -548.0, + "logps/rejected": -636.0, + "loss": 0.2021, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -11.4375, + "rewards/margins": 6.46875, + "rewards/rejected": -17.875, + "step": 2350 + }, + { + "epoch": 1.2349555206698064, + "grad_norm": 1.8094583649915732, + "learning_rate": 3.661548690503417e-05, + "logits/chosen": -13.0625, + "logits/rejected": -12.75, + "logps/chosen": -552.0, + "logps/rejected": -652.0, + "loss": 0.1449, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -11.3125, + "rewards/margins": 7.125, + "rewards/rejected": -18.375, + "step": 2360 + }, + { + "epoch": 1.2401883830455258, + "grad_norm": 6.973218769623946, + "learning_rate": 3.648046363630685e-05, + "logits/chosen": -13.5, + "logits/rejected": -13.375, + "logps/chosen": -588.0, + "logps/rejected": -684.0, + "loss": 0.1426, + "rewards/accuracies": 0.9375, + "rewards/chosen": -12.0, + "rewards/margins": 7.4375, + "rewards/rejected": -19.375, + "step": 2370 + }, + { + "epoch": 1.2454212454212454, + "grad_norm": 2.724033971663383, + "learning_rate": 3.6345014644911835e-05, + "logits/chosen": -12.875, + "logits/rejected": -12.5, + "logps/chosen": -544.0, + "logps/rejected": -640.0, + "loss": 0.1231, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -11.0625, + "rewards/margins": 7.34375, + "rewards/rejected": -18.375, + "step": 2380 + }, + { + "epoch": 1.250654107796965, + "grad_norm": 3.3380336722140207, + "learning_rate": 3.620914495361718e-05, + "logits/chosen": -12.875, + "logits/rejected": -12.3125, + "logps/chosen": -616.0, + "logps/rejected": -736.0, + "loss": 0.1705, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -11.0625, + "rewards/margins": 8.25, + "rewards/rejected": -19.25, + "step": 2390 + }, + { + "epoch": 1.2558869701726845, + "grad_norm": 5.007066809175749, + "learning_rate": 3.607285960079146e-05, + "logits/chosen": -12.875, + "logits/rejected": -12.8125, + "logps/chosen": -556.0, + "logps/rejected": -664.0, + "loss": 0.1405, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -9.75, + "rewards/margins": 8.0, + "rewards/rejected": -17.75, + "step": 2400 + }, + { + "epoch": 1.2611198325484039, + "grad_norm": 1.9696214011970825, + "learning_rate": 3.5936163640217014e-05, + "logits/chosen": -12.75, + "logits/rejected": -13.25, + "logps/chosen": -576.0, + "logps/rejected": -676.0, + "loss": 0.2191, + "rewards/accuracies": 0.9375, + "rewards/chosen": -9.3125, + "rewards/margins": 7.78125, + "rewards/rejected": -17.125, + "step": 2410 + }, + { + "epoch": 1.2663526949241235, + "grad_norm": 5.480736608377085, + "learning_rate": 3.5799062140902417e-05, + "logits/chosen": -12.875, + "logits/rejected": -13.5625, + "logps/chosen": -592.0, + "logps/rejected": -612.0, + "loss": 0.1494, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -9.5625, + "rewards/margins": 7.59375, + "rewards/rejected": -17.125, + "step": 2420 + }, + { + "epoch": 1.2715855572998431, + "grad_norm": 3.9789353089091826, + "learning_rate": 3.566156018689463e-05, + "logits/chosen": -13.3125, + "logits/rejected": -13.4375, + "logps/chosen": -556.0, + "logps/rejected": -608.0, + "loss": 0.246, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.0625, + "rewards/margins": 6.5, + "rewards/rejected": -17.5, + "step": 2430 + }, + { + "epoch": 1.2768184196755625, + "grad_norm": 3.161435784298808, + "learning_rate": 3.552366287709038e-05, + "logits/chosen": -13.25, + "logits/rejected": -13.25, + "logps/chosen": -536.0, + "logps/rejected": -640.0, + "loss": 0.1567, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -10.3125, + "rewards/margins": 6.8125, + "rewards/rejected": -17.125, + "step": 2440 + }, + { + "epoch": 1.282051282051282, + "grad_norm": 4.24755187840357, + "learning_rate": 3.5385375325047166e-05, + "logits/chosen": -13.0, + "logits/rejected": -12.9375, + "logps/chosen": -532.0, + "logps/rejected": -592.0, + "loss": 0.1187, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -10.25, + "rewards/margins": 5.5625, + "rewards/rejected": -15.8125, + "step": 2450 + }, + { + "epoch": 1.2872841444270016, + "grad_norm": 9.226106460297151, + "learning_rate": 3.524670265879354e-05, + "logits/chosen": -13.125, + "logits/rejected": -13.0625, + "logps/chosen": -488.0, + "logps/rejected": -568.0, + "loss": 0.1767, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -10.5, + "rewards/margins": 5.875, + "rewards/rejected": -16.375, + "step": 2460 + }, + { + "epoch": 1.2925170068027212, + "grad_norm": 3.368430888172939, + "learning_rate": 3.5107650020639014e-05, + "logits/chosen": -13.1875, + "logits/rejected": -12.75, + "logps/chosen": -494.0, + "logps/rejected": -600.0, + "loss": 0.1158, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -9.875, + "rewards/margins": 5.875, + "rewards/rejected": -15.75, + "step": 2470 + }, + { + "epoch": 1.2977498691784406, + "grad_norm": 6.787912182490308, + "learning_rate": 3.496822256698337e-05, + "logits/chosen": -13.0, + "logits/rejected": -12.5625, + "logps/chosen": -532.0, + "logps/rejected": -624.0, + "loss": 0.1839, + "rewards/accuracies": 0.9375, + "rewards/chosen": -10.75, + "rewards/margins": 6.84375, + "rewards/rejected": -17.625, + "step": 2480 + }, + { + "epoch": 1.30298273155416, + "grad_norm": 3.0940351195501674, + "learning_rate": 3.482842546812544e-05, + "logits/chosen": -12.75, + "logits/rejected": -13.0, + "logps/chosen": -604.0, + "logps/rejected": -648.0, + "loss": 0.0897, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -9.875, + "rewards/margins": 7.8125, + "rewards/rejected": -17.75, + "step": 2490 + }, + { + "epoch": 1.3082155939298796, + "grad_norm": 6.558994468239535, + "learning_rate": 3.468826390807131e-05, + "logits/chosen": -13.0625, + "logits/rejected": -12.9375, + "logps/chosen": -488.0, + "logps/rejected": -592.0, + "loss": 0.1485, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -10.3125, + "rewards/margins": 5.96875, + "rewards/rejected": -16.25, + "step": 2500 + }, + { + "epoch": 1.3134484563055993, + "grad_norm": 1.2318773732626507, + "learning_rate": 3.454774308434222e-05, + "logits/chosen": -12.875, + "logits/rejected": -12.6875, + "logps/chosen": -492.0, + "logps/rejected": -656.0, + "loss": 0.1238, + "rewards/accuracies": 0.9375, + "rewards/chosen": -10.375, + "rewards/margins": 6.71875, + "rewards/rejected": -17.125, + "step": 2510 + }, + { + "epoch": 1.3186813186813187, + "grad_norm": 5.803207144946933, + "learning_rate": 3.4406868207781725e-05, + "logits/chosen": -12.6875, + "logits/rejected": -12.75, + "logps/chosen": -548.0, + "logps/rejected": -604.0, + "loss": 0.1611, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -11.0625, + "rewards/margins": 6.15625, + "rewards/rejected": -17.25, + "step": 2520 + }, + { + "epoch": 1.323914181057038, + "grad_norm": 2.294633376201315, + "learning_rate": 3.4265644502362495e-05, + "logits/chosen": -12.6875, + "logits/rejected": -13.0625, + "logps/chosen": -580.0, + "logps/rejected": -680.0, + "loss": 0.1449, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -11.125, + "rewards/margins": 7.65625, + "rewards/rejected": -18.75, + "step": 2530 + }, + { + "epoch": 1.3291470434327577, + "grad_norm": 4.04713234571419, + "learning_rate": 3.4124077204992576e-05, + "logits/chosen": -12.8125, + "logits/rejected": -12.875, + "logps/chosen": -492.0, + "logps/rejected": -600.0, + "loss": 0.2189, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.8125, + "rewards/margins": 6.15625, + "rewards/rejected": -17.0, + "step": 2540 + }, + { + "epoch": 1.3343799058084773, + "grad_norm": 3.405504777424004, + "learning_rate": 3.398217156532125e-05, + "logits/chosen": -12.875, + "logits/rejected": -13.0625, + "logps/chosen": -552.0, + "logps/rejected": -632.0, + "loss": 0.1636, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -11.4375, + "rewards/margins": 6.40625, + "rewards/rejected": -17.875, + "step": 2550 + }, + { + "epoch": 1.3396127681841967, + "grad_norm": 3.1168289976514116, + "learning_rate": 3.383993284554431e-05, + "logits/chosen": -12.5625, + "logits/rejected": -12.75, + "logps/chosen": -544.0, + "logps/rejected": -608.0, + "loss": 0.1437, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -10.4375, + "rewards/margins": 6.6875, + "rewards/rejected": -17.125, + "step": 2560 + }, + { + "epoch": 1.3448456305599163, + "grad_norm": 3.505753194889169, + "learning_rate": 3.3697366320208955e-05, + "logits/chosen": -12.875, + "logits/rejected": -13.0, + "logps/chosen": -552.0, + "logps/rejected": -640.0, + "loss": 0.136, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -10.3125, + "rewards/margins": 7.0625, + "rewards/rejected": -17.375, + "step": 2570 + }, + { + "epoch": 1.3500784929356358, + "grad_norm": 4.005823523601315, + "learning_rate": 3.355447727601816e-05, + "logits/chosen": -12.25, + "logits/rejected": -12.1875, + "logps/chosen": -528.0, + "logps/rejected": -636.0, + "loss": 0.2811, + "rewards/accuracies": 0.9375, + "rewards/chosen": -10.1875, + "rewards/margins": 7.3125, + "rewards/rejected": -17.5, + "step": 2580 + }, + { + "epoch": 1.3553113553113554, + "grad_norm": 4.516125334834659, + "learning_rate": 3.34112710116347e-05, + "logits/chosen": -12.5, + "logits/rejected": -12.4375, + "logps/chosen": -568.0, + "logps/rejected": -704.0, + "loss": 0.2014, + "rewards/accuracies": 0.9375, + "rewards/chosen": -10.4375, + "rewards/margins": 7.4375, + "rewards/rejected": -17.875, + "step": 2590 + }, + { + "epoch": 1.3605442176870748, + "grad_norm": 2.4451528489453866, + "learning_rate": 3.326775283748459e-05, + "logits/chosen": -12.5, + "logits/rejected": -12.25, + "logps/chosen": -492.0, + "logps/rejected": -608.0, + "loss": 0.1692, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -10.4375, + "rewards/margins": 6.5, + "rewards/rejected": -17.0, + "step": 2600 + }, + { + "epoch": 1.3657770800627944, + "grad_norm": 5.747504543634415, + "learning_rate": 3.3123928075560204e-05, + "logits/chosen": -12.5625, + "logits/rejected": -12.625, + "logps/chosen": -560.0, + "logps/rejected": -628.0, + "loss": 0.1808, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -10.25, + "rewards/margins": 6.625, + "rewards/rejected": -16.875, + "step": 2610 + }, + { + "epoch": 1.3710099424385138, + "grad_norm": 2.096012785984857, + "learning_rate": 3.297980205922294e-05, + "logits/chosen": -12.875, + "logits/rejected": -13.0, + "logps/chosen": -532.0, + "logps/rejected": -600.0, + "loss": 0.1871, + "rewards/accuracies": 0.9375, + "rewards/chosen": -10.5625, + "rewards/margins": 6.0625, + "rewards/rejected": -16.625, + "step": 2620 + }, + { + "epoch": 1.3762428048142334, + "grad_norm": 2.4760284272107365, + "learning_rate": 3.2835380133005375e-05, + "logits/chosen": -13.125, + "logits/rejected": -12.8125, + "logps/chosen": -482.0, + "logps/rejected": -616.0, + "loss": 0.1288, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -10.625, + "rewards/margins": 6.375, + "rewards/rejected": -17.0, + "step": 2630 + }, + { + "epoch": 1.3814756671899528, + "grad_norm": 1.9511277267799827, + "learning_rate": 3.269066765241314e-05, + "logits/chosen": -12.75, + "logits/rejected": -12.5, + "logps/chosen": -536.0, + "logps/rejected": -600.0, + "loss": 0.1738, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -11.1875, + "rewards/margins": 5.71875, + "rewards/rejected": -16.875, + "step": 2640 + }, + { + "epoch": 1.3867085295656725, + "grad_norm": 2.9900866237679318, + "learning_rate": 3.254566998372634e-05, + "logits/chosen": -12.875, + "logits/rejected": -12.6875, + "logps/chosen": -506.0, + "logps/rejected": -640.0, + "loss": 0.1671, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -11.3125, + "rewards/margins": 6.34375, + "rewards/rejected": -17.625, + "step": 2650 + }, + { + "epoch": 1.3919413919413919, + "grad_norm": 4.0853159087986075, + "learning_rate": 3.240039250380048e-05, + "logits/chosen": -12.75, + "logits/rejected": -12.625, + "logps/chosen": -580.0, + "logps/rejected": -760.0, + "loss": 0.1026, + "rewards/accuracies": 0.9375, + "rewards/chosen": -10.9375, + "rewards/margins": 8.4375, + "rewards/rejected": -19.375, + "step": 2660 + }, + { + "epoch": 1.3971742543171115, + "grad_norm": 3.301818111899913, + "learning_rate": 3.225484059986715e-05, + "logits/chosen": -12.6875, + "logits/rejected": -12.625, + "logps/chosen": -560.0, + "logps/rejected": -624.0, + "loss": 0.1356, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -11.875, + "rewards/margins": 6.4375, + "rewards/rejected": -18.25, + "step": 2670 + }, + { + "epoch": 1.402407116692831, + "grad_norm": 3.5961658429484795, + "learning_rate": 3.2109019669334216e-05, + "logits/chosen": -12.75, + "logits/rejected": -12.6875, + "logps/chosen": -572.0, + "logps/rejected": -720.0, + "loss": 0.1695, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -10.5, + "rewards/margins": 9.6875, + "rewards/rejected": -20.25, + "step": 2680 + }, + { + "epoch": 1.4076399790685505, + "grad_norm": 4.222385432033824, + "learning_rate": 3.1962935119585705e-05, + "logits/chosen": -12.875, + "logits/rejected": -12.875, + "logps/chosen": -484.0, + "logps/rejected": -636.0, + "loss": 0.1752, + "rewards/accuracies": 0.9375, + "rewards/chosen": -10.625, + "rewards/margins": 6.625, + "rewards/rejected": -17.25, + "step": 2690 + }, + { + "epoch": 1.41287284144427, + "grad_norm": 3.6697103875093746, + "learning_rate": 3.181659236778124e-05, + "logits/chosen": -12.4375, + "logits/rejected": -12.8125, + "logps/chosen": -580.0, + "logps/rejected": -656.0, + "loss": 0.1405, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -10.75, + "rewards/margins": 7.5625, + "rewards/rejected": -18.25, + "step": 2700 + }, + { + "epoch": 1.4181057038199896, + "grad_norm": 5.4446370048728, + "learning_rate": 3.166999684065521e-05, + "logits/chosen": -12.4375, + "logits/rejected": -12.5625, + "logps/chosen": -552.0, + "logps/rejected": -632.0, + "loss": 0.1835, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -10.625, + "rewards/margins": 7.0625, + "rewards/rejected": -17.625, + "step": 2710 + }, + { + "epoch": 1.423338566195709, + "grad_norm": 1.4971937298054474, + "learning_rate": 3.15231539743155e-05, + "logits/chosen": -12.625, + "logits/rejected": -12.75, + "logps/chosen": -524.0, + "logps/rejected": -616.0, + "loss": 0.166, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -10.75, + "rewards/margins": 6.3125, + "rewards/rejected": -17.0, + "step": 2720 + }, + { + "epoch": 1.4285714285714286, + "grad_norm": 7.798045925589297, + "learning_rate": 3.1376069214041913e-05, + "logits/chosen": -12.875, + "logits/rejected": -13.0625, + "logps/chosen": -540.0, + "logps/rejected": -636.0, + "loss": 0.2303, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -11.5625, + "rewards/margins": 6.34375, + "rewards/rejected": -17.875, + "step": 2730 + }, + { + "epoch": 1.433804290947148, + "grad_norm": 7.892586302760298, + "learning_rate": 3.1228748014084246e-05, + "logits/chosen": -12.5625, + "logits/rejected": -12.75, + "logps/chosen": -552.0, + "logps/rejected": -644.0, + "loss": 0.2323, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.875, + "rewards/margins": 6.65625, + "rewards/rejected": -18.5, + "step": 2740 + }, + { + "epoch": 1.4390371533228676, + "grad_norm": 2.2260286497847126, + "learning_rate": 3.1081195837460055e-05, + "logits/chosen": -12.625, + "logits/rejected": -12.5, + "logps/chosen": -528.0, + "logps/rejected": -688.0, + "loss": 0.1634, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -11.6875, + "rewards/margins": 6.90625, + "rewards/rejected": -18.5, + "step": 2750 + }, + { + "epoch": 1.4442700156985873, + "grad_norm": 3.902080264497474, + "learning_rate": 3.0933418155752026e-05, + "logits/chosen": -12.625, + "logits/rejected": -12.75, + "logps/chosen": -580.0, + "logps/rejected": -604.0, + "loss": 0.1366, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -11.375, + "rewards/margins": 6.96875, + "rewards/rejected": -18.375, + "step": 2760 + }, + { + "epoch": 1.4495028780743067, + "grad_norm": 6.565793274642643, + "learning_rate": 3.0785420448905134e-05, + "logits/chosen": -12.5, + "logits/rejected": -12.6875, + "logps/chosen": -576.0, + "logps/rejected": -652.0, + "loss": 0.1804, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -10.875, + "rewards/margins": 7.125, + "rewards/rejected": -18.0, + "step": 2770 + }, + { + "epoch": 1.454735740450026, + "grad_norm": 4.78069477411253, + "learning_rate": 3.063720820502339e-05, + "logits/chosen": -12.375, + "logits/rejected": -12.3125, + "logps/chosen": -516.0, + "logps/rejected": -580.0, + "loss": 0.1679, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -10.625, + "rewards/margins": 6.15625, + "rewards/rejected": -16.75, + "step": 2780 + }, + { + "epoch": 1.4599686028257457, + "grad_norm": 2.9641760959749854, + "learning_rate": 3.0488786920166345e-05, + "logits/chosen": -12.3125, + "logits/rejected": -12.1875, + "logps/chosen": -548.0, + "logps/rejected": -700.0, + "loss": 0.1477, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -10.9375, + "rewards/margins": 7.3125, + "rewards/rejected": -18.25, + "step": 2790 + }, + { + "epoch": 1.4652014652014653, + "grad_norm": 3.1703555226123155, + "learning_rate": 3.03401620981453e-05, + "logits/chosen": -12.3125, + "logits/rejected": -12.375, + "logps/chosen": -588.0, + "logps/rejected": -724.0, + "loss": 0.1567, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -11.0625, + "rewards/margins": 8.125, + "rewards/rejected": -19.25, + "step": 2800 + }, + { + "epoch": 1.4704343275771847, + "grad_norm": 1.6396098144570719, + "learning_rate": 3.019133925031915e-05, + "logits/chosen": -12.25, + "logits/rejected": -12.0625, + "logps/chosen": -572.0, + "logps/rejected": -692.0, + "loss": 0.1122, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -12.125, + "rewards/margins": 6.875, + "rewards/rejected": -19.0, + "step": 2810 + }, + { + "epoch": 1.4756671899529041, + "grad_norm": 3.24108170871626, + "learning_rate": 3.004232389539011e-05, + "logits/chosen": -12.25, + "logits/rejected": -12.0, + "logps/chosen": -524.0, + "logps/rejected": -640.0, + "loss": 0.1385, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -11.25, + "rewards/margins": 7.09375, + "rewards/rejected": -18.375, + "step": 2820 + }, + { + "epoch": 1.4809000523286238, + "grad_norm": 1.6066493200954974, + "learning_rate": 2.9893121559198983e-05, + "logits/chosen": -12.6875, + "logits/rejected": -12.875, + "logps/chosen": -528.0, + "logps/rejected": -632.0, + "loss": 0.1041, + "rewards/accuracies": 0.875, + "rewards/chosen": -10.75, + "rewards/margins": 6.75, + "rewards/rejected": -17.5, + "step": 2830 + }, + { + "epoch": 1.4861329147043434, + "grad_norm": 1.4015525250582879, + "learning_rate": 2.974373777452027e-05, + "logits/chosen": -12.3125, + "logits/rejected": -12.375, + "logps/chosen": -588.0, + "logps/rejected": -664.0, + "loss": 0.2043, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -10.5625, + "rewards/margins": 6.90625, + "rewards/rejected": -17.5, + "step": 2840 + }, + { + "epoch": 1.4913657770800628, + "grad_norm": 3.1847220201497715, + "learning_rate": 2.959417808085702e-05, + "logits/chosen": -12.375, + "logits/rejected": -12.375, + "logps/chosen": -548.0, + "logps/rejected": -636.0, + "loss": 0.1418, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -10.5625, + "rewards/margins": 7.5, + "rewards/rejected": -18.0, + "step": 2850 + }, + { + "epoch": 1.4965986394557822, + "grad_norm": 6.675033635871589, + "learning_rate": 2.9444448024235422e-05, + "logits/chosen": -11.875, + "logits/rejected": -12.0, + "logps/chosen": -544.0, + "logps/rejected": -696.0, + "loss": 0.1554, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -9.3125, + "rewards/margins": 8.6875, + "rewards/rejected": -18.0, + "step": 2860 + }, + { + "epoch": 1.5018315018315018, + "grad_norm": 1.8005269219564757, + "learning_rate": 2.9294553156999082e-05, + "logits/chosen": -11.75, + "logits/rejected": -11.8125, + "logps/chosen": -552.0, + "logps/rejected": -648.0, + "loss": 0.1726, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -10.25, + "rewards/margins": 6.53125, + "rewards/rejected": -16.75, + "step": 2870 + }, + { + "epoch": 1.5070643642072215, + "grad_norm": 3.5018889880690436, + "learning_rate": 2.9144499037603207e-05, + "logits/chosen": -11.5625, + "logits/rejected": -11.5625, + "logps/chosen": -540.0, + "logps/rejected": -652.0, + "loss": 0.1742, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -9.75, + "rewards/margins": 8.0625, + "rewards/rejected": -17.75, + "step": 2880 + }, + { + "epoch": 1.5122972265829409, + "grad_norm": 4.176359733461575, + "learning_rate": 2.8994291230408432e-05, + "logits/chosen": -11.4375, + "logits/rejected": -11.6875, + "logps/chosen": -600.0, + "logps/rejected": -684.0, + "loss": 0.1528, + "rewards/accuracies": 0.9375, + "rewards/chosen": -11.0, + "rewards/margins": 8.1875, + "rewards/rejected": -19.125, + "step": 2890 + }, + { + "epoch": 1.5175300889586603, + "grad_norm": 3.263054937351028, + "learning_rate": 2.8843935305474524e-05, + "logits/chosen": -12.5, + "logits/rejected": -12.375, + "logps/chosen": -588.0, + "logps/rejected": -764.0, + "loss": 0.1019, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -10.8125, + "rewards/margins": 10.625, + "rewards/rejected": -21.5, + "step": 2900 + }, + { + "epoch": 1.5227629513343799, + "grad_norm": 6.019092926637461, + "learning_rate": 2.869343683835376e-05, + "logits/chosen": -12.75, + "logits/rejected": -12.4375, + "logps/chosen": -536.0, + "logps/rejected": -676.0, + "loss": 0.174, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -11.75, + "rewards/margins": 7.53125, + "rewards/rejected": -19.25, + "step": 2910 + }, + { + "epoch": 1.5279958137100995, + "grad_norm": 3.2240174666369628, + "learning_rate": 2.8542801409884258e-05, + "logits/chosen": -12.3125, + "logits/rejected": -12.1875, + "logps/chosen": -588.0, + "logps/rejected": -736.0, + "loss": 0.1138, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -11.75, + "rewards/margins": 7.40625, + "rewards/rejected": -19.125, + "step": 2920 + }, + { + "epoch": 1.533228676085819, + "grad_norm": 3.2530918121387717, + "learning_rate": 2.839203460598297e-05, + "logits/chosen": -12.125, + "logits/rejected": -12.25, + "logps/chosen": -600.0, + "logps/rejected": -752.0, + "loss": 0.1355, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -11.25, + "rewards/margins": 7.6875, + "rewards/rejected": -19.0, + "step": 2930 + }, + { + "epoch": 1.5384615384615383, + "grad_norm": 4.134756743952236, + "learning_rate": 2.824114201743856e-05, + "logits/chosen": -12.375, + "logits/rejected": -12.3125, + "logps/chosen": -564.0, + "logps/rejected": -716.0, + "loss": 0.1474, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -10.5625, + "rewards/margins": 9.1875, + "rewards/rejected": -19.75, + "step": 2940 + }, + { + "epoch": 1.543694400837258, + "grad_norm": 5.566935384470763, + "learning_rate": 2.8090129239704083e-05, + "logits/chosen": -12.5, + "logits/rejected": -12.6875, + "logps/chosen": -536.0, + "logps/rejected": -592.0, + "loss": 0.1594, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -11.3125, + "rewards/margins": 6.9375, + "rewards/rejected": -18.25, + "step": 2950 + }, + { + "epoch": 1.5489272632129776, + "grad_norm": 1.021451185458759, + "learning_rate": 2.7939001872689498e-05, + "logits/chosen": -12.125, + "logits/rejected": -12.25, + "logps/chosen": -520.0, + "logps/rejected": -604.0, + "loss": 0.147, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.0, + "rewards/margins": 6.4375, + "rewards/rejected": -17.5, + "step": 2960 + }, + { + "epoch": 1.554160125588697, + "grad_norm": 4.827200174694787, + "learning_rate": 2.7787765520553984e-05, + "logits/chosen": -12.125, + "logits/rejected": -12.375, + "logps/chosen": -536.0, + "logps/rejected": -632.0, + "loss": 0.1299, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -10.6875, + "rewards/margins": 7.21875, + "rewards/rejected": -17.875, + "step": 2970 + }, + { + "epoch": 1.5593929879644164, + "grad_norm": 4.491667565433733, + "learning_rate": 2.7636425791498178e-05, + "logits/chosen": -12.1875, + "logits/rejected": -12.25, + "logps/chosen": -592.0, + "logps/rejected": -672.0, + "loss": 0.161, + "rewards/accuracies": 0.9375, + "rewards/chosen": -11.3125, + "rewards/margins": 6.6875, + "rewards/rejected": -18.0, + "step": 2980 + }, + { + "epoch": 1.564625850340136, + "grad_norm": 3.4830523290144852, + "learning_rate": 2.748498829755615e-05, + "logits/chosen": -12.0, + "logits/rejected": -11.8125, + "logps/chosen": -532.0, + "logps/rejected": -696.0, + "loss": 0.1582, + "rewards/accuracies": 0.9375, + "rewards/chosen": -11.3125, + "rewards/margins": 7.6875, + "rewards/rejected": -19.0, + "step": 2990 + }, + { + "epoch": 1.5698587127158556, + "grad_norm": 5.529987237479786, + "learning_rate": 2.7333458654387345e-05, + "logits/chosen": -12.0, + "logits/rejected": -12.125, + "logps/chosen": -592.0, + "logps/rejected": -688.0, + "loss": 0.1609, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -10.75, + "rewards/margins": 7.40625, + "rewards/rejected": -18.125, + "step": 3000 + }, + { + "epoch": 1.575091575091575, + "grad_norm": 7.669263543730199, + "learning_rate": 2.7181842481068282e-05, + "logits/chosen": -11.875, + "logits/rejected": -11.9375, + "logps/chosen": -584.0, + "logps/rejected": -716.0, + "loss": 0.1702, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -10.6875, + "rewards/margins": 7.78125, + "rewards/rejected": -18.5, + "step": 3010 + }, + { + "epoch": 1.5803244374672945, + "grad_norm": 5.58838804324731, + "learning_rate": 2.703014539988428e-05, + "logits/chosen": -12.375, + "logits/rejected": -12.5625, + "logps/chosen": -572.0, + "logps/rejected": -636.0, + "loss": 0.1976, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -11.6875, + "rewards/margins": 5.65625, + "rewards/rejected": -17.375, + "step": 3020 + }, + { + "epoch": 1.585557299843014, + "grad_norm": 4.214778326380665, + "learning_rate": 2.6878373036120852e-05, + "logits/chosen": -11.875, + "logits/rejected": -12.0625, + "logps/chosen": -596.0, + "logps/rejected": -648.0, + "loss": 0.1694, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -11.6875, + "rewards/margins": 5.84375, + "rewards/rejected": -17.5, + "step": 3030 + }, + { + "epoch": 1.5907901622187337, + "grad_norm": 4.08908410186959, + "learning_rate": 2.6726531017855194e-05, + "logits/chosen": -12.1875, + "logits/rejected": -12.125, + "logps/chosen": -524.0, + "logps/rejected": -668.0, + "loss": 0.1309, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -10.125, + "rewards/margins": 7.75, + "rewards/rejected": -17.875, + "step": 3040 + }, + { + "epoch": 1.5960230245944533, + "grad_norm": 2.1197256668242597, + "learning_rate": 2.657462497574747e-05, + "logits/chosen": -12.125, + "logits/rejected": -12.25, + "logps/chosen": -498.0, + "logps/rejected": -576.0, + "loss": 0.1233, + "rewards/accuracies": 0.9375, + "rewards/chosen": -10.75, + "rewards/margins": 6.5, + "rewards/rejected": -17.25, + "step": 3050 + }, + { + "epoch": 1.6012558869701727, + "grad_norm": 2.5029724664557276, + "learning_rate": 2.642266054283198e-05, + "logits/chosen": -11.9375, + "logits/rejected": -12.375, + "logps/chosen": -600.0, + "logps/rejected": -660.0, + "loss": 0.1141, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -10.6875, + "rewards/margins": 7.8125, + "rewards/rejected": -18.5, + "step": 3060 + }, + { + "epoch": 1.6064887493458921, + "grad_norm": 3.899947414356875, + "learning_rate": 2.6270643354308288e-05, + "logits/chosen": -11.9375, + "logits/rejected": -12.0, + "logps/chosen": -564.0, + "logps/rejected": -684.0, + "loss": 0.1493, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -10.5, + "rewards/margins": 7.65625, + "rewards/rejected": -18.25, + "step": 3070 + }, + { + "epoch": 1.6117216117216118, + "grad_norm": 2.3195694442688977, + "learning_rate": 2.611857904733227e-05, + "logits/chosen": -11.8125, + "logits/rejected": -12.3125, + "logps/chosen": -560.0, + "logps/rejected": -620.0, + "loss": 0.1386, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -10.1875, + "rewards/margins": 7.28125, + "rewards/rejected": -17.5, + "step": 3080 + }, + { + "epoch": 1.6169544740973314, + "grad_norm": 2.0725325059582, + "learning_rate": 2.5966473260807078e-05, + "logits/chosen": -11.875, + "logits/rejected": -11.9375, + "logps/chosen": -576.0, + "logps/rejected": -720.0, + "loss": 0.125, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -10.375, + "rewards/margins": 8.625, + "rewards/rejected": -19.0, + "step": 3090 + }, + { + "epoch": 1.6221873364730508, + "grad_norm": 4.113993754263742, + "learning_rate": 2.5814331635173987e-05, + "logits/chosen": -11.6875, + "logits/rejected": -11.8125, + "logps/chosen": -560.0, + "logps/rejected": -656.0, + "loss": 0.172, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -10.8125, + "rewards/margins": 6.28125, + "rewards/rejected": -17.125, + "step": 3100 + }, + { + "epoch": 1.6274201988487702, + "grad_norm": 2.678743740836168, + "learning_rate": 2.5662159812203313e-05, + "logits/chosen": -12.0, + "logits/rejected": -11.875, + "logps/chosen": -512.0, + "logps/rejected": -636.0, + "loss": 0.1411, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -10.4375, + "rewards/margins": 6.5625, + "rewards/rejected": -17.0, + "step": 3110 + }, + { + "epoch": 1.6326530612244898, + "grad_norm": 2.9143799299622337, + "learning_rate": 2.550996343478514e-05, + "logits/chosen": -11.5625, + "logits/rejected": -12.125, + "logps/chosen": -568.0, + "logps/rejected": -652.0, + "loss": 0.1315, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -9.3125, + "rewards/margins": 8.375, + "rewards/rejected": -17.75, + "step": 3120 + }, + { + "epoch": 1.6378859236002095, + "grad_norm": 6.560949719147845, + "learning_rate": 2.535774814672008e-05, + "logits/chosen": -11.5, + "logits/rejected": -12.0, + "logps/chosen": -512.0, + "logps/rejected": -580.0, + "loss": 0.1572, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -9.5625, + "rewards/margins": 6.8125, + "rewards/rejected": -16.375, + "step": 3130 + }, + { + "epoch": 1.6431187859759289, + "grad_norm": 5.931912626292325, + "learning_rate": 2.5205519592509995e-05, + "logits/chosen": -11.9375, + "logits/rejected": -12.1875, + "logps/chosen": -502.0, + "logps/rejected": -624.0, + "loss": 0.1467, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -10.3125, + "rewards/margins": 7.125, + "rewards/rejected": -17.375, + "step": 3140 + }, + { + "epoch": 1.6483516483516483, + "grad_norm": 7.463777806346751, + "learning_rate": 2.505328341714873e-05, + "logits/chosen": -12.0625, + "logits/rejected": -12.375, + "logps/chosen": -536.0, + "logps/rejected": -592.0, + "loss": 0.1254, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -10.4375, + "rewards/margins": 6.53125, + "rewards/rejected": -17.0, + "step": 3150 + }, + { + "epoch": 1.653584510727368, + "grad_norm": 3.2753809334776762, + "learning_rate": 2.490104526591269e-05, + "logits/chosen": -12.3125, + "logits/rejected": -12.375, + "logps/chosen": -544.0, + "logps/rejected": -644.0, + "loss": 0.1273, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -10.1875, + "rewards/margins": 6.875, + "rewards/rejected": -17.0, + "step": 3160 + }, + { + "epoch": 1.6588173731030875, + "grad_norm": 2.3621905847019034, + "learning_rate": 2.474881078415156e-05, + "logits/chosen": -12.625, + "logits/rejected": -12.9375, + "logps/chosen": -532.0, + "logps/rejected": -600.0, + "loss": 0.0922, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -10.3125, + "rewards/margins": 6.6875, + "rewards/rejected": -17.0, + "step": 3170 + }, + { + "epoch": 1.664050235478807, + "grad_norm": 6.109434642218217, + "learning_rate": 2.4596585617078982e-05, + "logits/chosen": -12.4375, + "logits/rejected": -12.25, + "logps/chosen": -544.0, + "logps/rejected": -672.0, + "loss": 0.1383, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -10.75, + "rewards/margins": 6.96875, + "rewards/rejected": -17.75, + "step": 3180 + }, + { + "epoch": 1.6692830978545263, + "grad_norm": 5.313432208482434, + "learning_rate": 2.4444375409563145e-05, + "logits/chosen": -12.6875, + "logits/rejected": -12.75, + "logps/chosen": -544.0, + "logps/rejected": -668.0, + "loss": 0.2005, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -10.875, + "rewards/margins": 8.125, + "rewards/rejected": -19.0, + "step": 3190 + }, + { + "epoch": 1.674515960230246, + "grad_norm": 9.780521211739286, + "learning_rate": 2.429218580591753e-05, + "logits/chosen": -12.625, + "logits/rejected": -12.625, + "logps/chosen": -588.0, + "logps/rejected": -620.0, + "loss": 0.2085, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -11.1875, + "rewards/margins": 5.4375, + "rewards/rejected": -16.625, + "step": 3200 + }, + { + "epoch": 1.6797488226059656, + "grad_norm": 6.9752987720127475, + "learning_rate": 2.4140022449691583e-05, + "logits/chosen": -12.0625, + "logits/rejected": -12.125, + "logps/chosen": -580.0, + "logps/rejected": -692.0, + "loss": 0.1815, + "rewards/accuracies": 0.9375, + "rewards/chosen": -10.9375, + "rewards/margins": 7.625, + "rewards/rejected": -18.5, + "step": 3210 + }, + { + "epoch": 1.684981684981685, + "grad_norm": 2.104780380076254, + "learning_rate": 2.3987890983461407e-05, + "logits/chosen": -12.1875, + "logits/rejected": -12.375, + "logps/chosen": -580.0, + "logps/rejected": -744.0, + "loss": 0.1257, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -10.5, + "rewards/margins": 9.875, + "rewards/rejected": -20.375, + "step": 3220 + }, + { + "epoch": 1.6902145473574044, + "grad_norm": 4.920138400963844, + "learning_rate": 2.3835797048620567e-05, + "logits/chosen": -12.375, + "logits/rejected": -12.625, + "logps/chosen": -552.0, + "logps/rejected": -616.0, + "loss": 0.2009, + "rewards/accuracies": 0.9375, + "rewards/chosen": -11.125, + "rewards/margins": 6.25, + "rewards/rejected": -17.375, + "step": 3230 + }, + { + "epoch": 1.695447409733124, + "grad_norm": 6.088610610369365, + "learning_rate": 2.368374628517088e-05, + "logits/chosen": -12.125, + "logits/rejected": -12.25, + "logps/chosen": -532.0, + "logps/rejected": -624.0, + "loss": 0.1616, + "rewards/accuracies": 0.9375, + "rewards/chosen": -10.5, + "rewards/margins": 6.40625, + "rewards/rejected": -16.875, + "step": 3240 + }, + { + "epoch": 1.7006802721088436, + "grad_norm": 0.9648785608001027, + "learning_rate": 2.353174433151325e-05, + "logits/chosen": -12.1875, + "logits/rejected": -12.25, + "logps/chosen": -490.0, + "logps/rejected": -644.0, + "loss": 0.1135, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -9.4375, + "rewards/margins": 7.96875, + "rewards/rejected": -17.375, + "step": 3250 + }, + { + "epoch": 1.705913134484563, + "grad_norm": 7.125968557878401, + "learning_rate": 2.3379796824238608e-05, + "logits/chosen": -12.0, + "logits/rejected": -12.3125, + "logps/chosen": -520.0, + "logps/rejected": -612.0, + "loss": 0.1642, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -10.8125, + "rewards/margins": 6.28125, + "rewards/rejected": -17.0, + "step": 3260 + }, + { + "epoch": 1.7111459968602825, + "grad_norm": 2.1436506196796894, + "learning_rate": 2.3227909397918897e-05, + "logits/chosen": -11.6875, + "logits/rejected": -11.6875, + "logps/chosen": -548.0, + "logps/rejected": -772.0, + "loss": 0.1473, + "rewards/accuracies": 1.0, + "rewards/chosen": -8.875, + "rewards/margins": 10.8125, + "rewards/rejected": -19.625, + "step": 3270 + }, + { + "epoch": 1.716378859236002, + "grad_norm": 3.8126804432159385, + "learning_rate": 2.307608768489808e-05, + "logits/chosen": -11.8125, + "logits/rejected": -12.125, + "logps/chosen": -564.0, + "logps/rejected": -700.0, + "loss": 0.1341, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -10.375, + "rewards/margins": 8.9375, + "rewards/rejected": -19.25, + "step": 3280 + }, + { + "epoch": 1.7216117216117217, + "grad_norm": 5.170043751490522, + "learning_rate": 2.2924337315083356e-05, + "logits/chosen": -11.9375, + "logits/rejected": -12.25, + "logps/chosen": -552.0, + "logps/rejected": -680.0, + "loss": 0.1538, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -11.375, + "rewards/margins": 7.25, + "rewards/rejected": -18.625, + "step": 3290 + }, + { + "epoch": 1.7268445839874411, + "grad_norm": 3.086020388242938, + "learning_rate": 2.277266391573633e-05, + "logits/chosen": -11.875, + "logits/rejected": -12.0625, + "logps/chosen": -568.0, + "logps/rejected": -676.0, + "loss": 0.1303, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -10.625, + "rewards/margins": 8.6875, + "rewards/rejected": -19.25, + "step": 3300 + }, + { + "epoch": 1.7320774463631605, + "grad_norm": 6.888685972751711, + "learning_rate": 2.262107311126436e-05, + "logits/chosen": -12.3125, + "logits/rejected": -12.375, + "logps/chosen": -540.0, + "logps/rejected": -640.0, + "loss": 0.1396, + "rewards/accuracies": 0.9375, + "rewards/chosen": -11.0, + "rewards/margins": 7.96875, + "rewards/rejected": -19.0, + "step": 3310 + }, + { + "epoch": 1.7373103087388801, + "grad_norm": 3.3674197456767434, + "learning_rate": 2.2469570523011996e-05, + "logits/chosen": -12.125, + "logits/rejected": -12.0625, + "logps/chosen": -560.0, + "logps/rejected": -704.0, + "loss": 0.1428, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -11.9375, + "rewards/margins": 7.46875, + "rewards/rejected": -19.375, + "step": 3320 + }, + { + "epoch": 1.7425431711145998, + "grad_norm": 1.53918095587136, + "learning_rate": 2.2318161769052525e-05, + "logits/chosen": -11.875, + "logits/rejected": -12.0, + "logps/chosen": -592.0, + "logps/rejected": -676.0, + "loss": 0.1163, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -12.0, + "rewards/margins": 7.3125, + "rewards/rejected": -19.375, + "step": 3330 + }, + { + "epoch": 1.7477760334903192, + "grad_norm": 1.6924601836899695, + "learning_rate": 2.2166852463979625e-05, + "logits/chosen": -12.625, + "logits/rejected": -12.5, + "logps/chosen": -548.0, + "logps/rejected": -672.0, + "loss": 0.1679, + "rewards/accuracies": 0.9375, + "rewards/chosen": -11.375, + "rewards/margins": 7.5625, + "rewards/rejected": -19.0, + "step": 3340 + }, + { + "epoch": 1.7530088958660386, + "grad_norm": 3.439235498676025, + "learning_rate": 2.2015648218699202e-05, + "logits/chosen": -12.375, + "logits/rejected": -12.375, + "logps/chosen": -492.0, + "logps/rejected": -608.0, + "loss": 0.1573, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -10.0, + "rewards/margins": 7.875, + "rewards/rejected": -17.875, + "step": 3350 + }, + { + "epoch": 1.7582417582417582, + "grad_norm": 2.5725031256973536, + "learning_rate": 2.1864554640221245e-05, + "logits/chosen": -12.25, + "logits/rejected": -12.25, + "logps/chosen": -520.0, + "logps/rejected": -632.0, + "loss": 0.1078, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -11.375, + "rewards/margins": 6.5625, + "rewards/rejected": -18.0, + "step": 3360 + }, + { + "epoch": 1.7634746206174778, + "grad_norm": 2.3780858254933848, + "learning_rate": 2.1713577331452017e-05, + "logits/chosen": -12.25, + "logits/rejected": -12.6875, + "logps/chosen": -608.0, + "logps/rejected": -648.0, + "loss": 0.1472, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -10.9375, + "rewards/margins": 6.8125, + "rewards/rejected": -17.75, + "step": 3370 + }, + { + "epoch": 1.7687074829931972, + "grad_norm": 3.788476799560272, + "learning_rate": 2.1562721890986202e-05, + "logits/chosen": -12.25, + "logits/rejected": -12.5625, + "logps/chosen": -536.0, + "logps/rejected": -636.0, + "loss": 0.1387, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -10.9375, + "rewards/margins": 7.53125, + "rewards/rejected": -18.5, + "step": 3380 + }, + { + "epoch": 1.7739403453689166, + "grad_norm": 3.0735850422446442, + "learning_rate": 2.1411993912899285e-05, + "logits/chosen": -12.0625, + "logits/rejected": -11.9375, + "logps/chosen": -516.0, + "logps/rejected": -676.0, + "loss": 0.1236, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -10.875, + "rewards/margins": 7.0625, + "rewards/rejected": -18.0, + "step": 3390 + }, + { + "epoch": 1.7791732077446363, + "grad_norm": 5.819898842195535, + "learning_rate": 2.126139898654021e-05, + "logits/chosen": -12.375, + "logits/rejected": -12.5625, + "logps/chosen": -552.0, + "logps/rejected": -628.0, + "loss": 0.1581, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -10.8125, + "rewards/margins": 7.5625, + "rewards/rejected": -18.375, + "step": 3400 + }, + { + "epoch": 1.784406070120356, + "grad_norm": 3.1283675613331243, + "learning_rate": 2.1110942696324017e-05, + "logits/chosen": -11.6875, + "logits/rejected": -11.8125, + "logps/chosen": -548.0, + "logps/rejected": -664.0, + "loss": 0.1817, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -10.0625, + "rewards/margins": 8.0, + "rewards/rejected": -18.125, + "step": 3410 + }, + { + "epoch": 1.7896389324960753, + "grad_norm": 5.783953308594588, + "learning_rate": 2.0960630621524763e-05, + "logits/chosen": -11.625, + "logits/rejected": -12.25, + "logps/chosen": -604.0, + "logps/rejected": -616.0, + "loss": 0.2123, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -10.5625, + "rewards/margins": 6.78125, + "rewards/rejected": -17.375, + "step": 3420 + }, + { + "epoch": 1.7948717948717947, + "grad_norm": 4.261494497992645, + "learning_rate": 2.0810468336068696e-05, + "logits/chosen": -12.0625, + "logits/rejected": -12.25, + "logps/chosen": -508.0, + "logps/rejected": -584.0, + "loss": 0.1938, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -10.9375, + "rewards/margins": 5.25, + "rewards/rejected": -16.25, + "step": 3430 + }, + { + "epoch": 1.8001046572475143, + "grad_norm": 1.741639534150338, + "learning_rate": 2.0660461408327536e-05, + "logits/chosen": -11.5625, + "logits/rejected": -11.75, + "logps/chosen": -600.0, + "logps/rejected": -716.0, + "loss": 0.1325, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -10.3125, + "rewards/margins": 9.0625, + "rewards/rejected": -19.375, + "step": 3440 + }, + { + "epoch": 1.805337519623234, + "grad_norm": 1.2924995018776002, + "learning_rate": 2.051061540091191e-05, + "logits/chosen": -11.8125, + "logits/rejected": -12.1875, + "logps/chosen": -548.0, + "logps/rejected": -628.0, + "loss": 0.1469, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -11.25, + "rewards/margins": 6.53125, + "rewards/rejected": -17.75, + "step": 3450 + }, + { + "epoch": 1.8105703819989536, + "grad_norm": 6.814794323471826, + "learning_rate": 2.0360935870465188e-05, + "logits/chosen": -11.875, + "logits/rejected": -12.25, + "logps/chosen": -584.0, + "logps/rejected": -680.0, + "loss": 0.137, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -11.25, + "rewards/margins": 7.59375, + "rewards/rejected": -18.875, + "step": 3460 + }, + { + "epoch": 1.815803244374673, + "grad_norm": 5.821728263769117, + "learning_rate": 2.021142836745739e-05, + "logits/chosen": -12.0, + "logits/rejected": -12.375, + "logps/chosen": -572.0, + "logps/rejected": -656.0, + "loss": 0.1666, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -11.5, + "rewards/margins": 7.8125, + "rewards/rejected": -19.25, + "step": 3470 + }, + { + "epoch": 1.8210361067503924, + "grad_norm": 4.713492097008847, + "learning_rate": 2.006209843597931e-05, + "logits/chosen": -12.25, + "logits/rejected": -12.0, + "logps/chosen": -604.0, + "logps/rejected": -716.0, + "loss": 0.1517, + "rewards/accuracies": 0.9375, + "rewards/chosen": -11.875, + "rewards/margins": 8.0, + "rewards/rejected": -19.875, + "step": 3480 + }, + { + "epoch": 1.826268969126112, + "grad_norm": 3.673927096473944, + "learning_rate": 1.9912951613537e-05, + "logits/chosen": -11.6875, + "logits/rejected": -12.125, + "logps/chosen": -564.0, + "logps/rejected": -636.0, + "loss": 0.1386, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -10.3125, + "rewards/margins": 7.34375, + "rewards/rejected": -17.625, + "step": 3490 + }, + { + "epoch": 1.8315018315018317, + "grad_norm": 7.301159459916146, + "learning_rate": 1.9763993430846395e-05, + "logits/chosen": -12.0625, + "logits/rejected": -12.5625, + "logps/chosen": -564.0, + "logps/rejected": -624.0, + "loss": 0.1443, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -11.4375, + "rewards/margins": 5.90625, + "rewards/rejected": -17.375, + "step": 3500 + }, + { + "epoch": 1.836734693877551, + "grad_norm": 3.656823847113072, + "learning_rate": 1.9615229411628215e-05, + "logits/chosen": -12.0, + "logits/rejected": -12.0625, + "logps/chosen": -498.0, + "logps/rejected": -620.0, + "loss": 0.1425, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -11.3125, + "rewards/margins": 5.65625, + "rewards/rejected": -17.0, + "step": 3510 + }, + { + "epoch": 1.8419675562532705, + "grad_norm": 3.2318169109546697, + "learning_rate": 1.9466665072403142e-05, + "logits/chosen": -11.75, + "logits/rejected": -12.0625, + "logps/chosen": -588.0, + "logps/rejected": -672.0, + "loss": 0.117, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -11.5625, + "rewards/margins": 7.125, + "rewards/rejected": -18.75, + "step": 3520 + }, + { + "epoch": 1.84720041862899, + "grad_norm": 1.5645281327092355, + "learning_rate": 1.931830592228727e-05, + "logits/chosen": -11.6875, + "logits/rejected": -11.6875, + "logps/chosen": -524.0, + "logps/rejected": -640.0, + "loss": 0.1387, + "rewards/accuracies": 0.9375, + "rewards/chosen": -10.25, + "rewards/margins": 6.65625, + "rewards/rejected": -16.875, + "step": 3530 + }, + { + "epoch": 1.8524332810047097, + "grad_norm": 2.110816123401139, + "learning_rate": 1.9170157462787764e-05, + "logits/chosen": -11.75, + "logits/rejected": -12.125, + "logps/chosen": -608.0, + "logps/rejected": -648.0, + "loss": 0.1096, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -10.3125, + "rewards/margins": 7.40625, + "rewards/rejected": -17.75, + "step": 3540 + }, + { + "epoch": 1.8576661433804291, + "grad_norm": 6.374688050556006, + "learning_rate": 1.902222518759891e-05, + "logits/chosen": -11.4375, + "logits/rejected": -11.8125, + "logps/chosen": -604.0, + "logps/rejected": -724.0, + "loss": 0.1465, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -10.5, + "rewards/margins": 8.4375, + "rewards/rejected": -18.875, + "step": 3550 + }, + { + "epoch": 1.8628990057561485, + "grad_norm": 6.999097914110718, + "learning_rate": 1.887451458239837e-05, + "logits/chosen": -11.5625, + "logits/rejected": -11.6875, + "logps/chosen": -564.0, + "logps/rejected": -696.0, + "loss": 0.161, + "rewards/accuracies": 0.9375, + "rewards/chosen": -10.6875, + "rewards/margins": 7.71875, + "rewards/rejected": -18.375, + "step": 3560 + }, + { + "epoch": 1.8681318681318682, + "grad_norm": 4.0625683050938, + "learning_rate": 1.872703112464374e-05, + "logits/chosen": -12.0, + "logits/rejected": -12.3125, + "logps/chosen": -520.0, + "logps/rejected": -624.0, + "loss": 0.1101, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -10.375, + "rewards/margins": 6.65625, + "rewards/rejected": -17.0, + "step": 3570 + }, + { + "epoch": 1.8733647305075878, + "grad_norm": 2.9250058221000166, + "learning_rate": 1.8579780283369475e-05, + "logits/chosen": -11.875, + "logits/rejected": -12.125, + "logps/chosen": -556.0, + "logps/rejected": -640.0, + "loss": 0.1699, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -10.4375, + "rewards/margins": 7.40625, + "rewards/rejected": -17.875, + "step": 3580 + }, + { + "epoch": 1.8785975928833072, + "grad_norm": 3.3917682629483004, + "learning_rate": 1.8432767518984047e-05, + "logits/chosen": -12.3125, + "logits/rejected": -12.375, + "logps/chosen": -556.0, + "logps/rejected": -664.0, + "loss": 0.1231, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -11.0, + "rewards/margins": 7.5625, + "rewards/rejected": -18.625, + "step": 3590 + }, + { + "epoch": 1.8838304552590266, + "grad_norm": 4.3214481399637386, + "learning_rate": 1.828599828306748e-05, + "logits/chosen": -12.5, + "logits/rejected": -12.625, + "logps/chosen": -506.0, + "logps/rejected": -604.0, + "loss": 0.1113, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -10.8125, + "rewards/margins": 7.0625, + "rewards/rejected": -17.875, + "step": 3600 + }, + { + "epoch": 1.8890633176347462, + "grad_norm": 5.266498753372491, + "learning_rate": 1.8139478018169197e-05, + "logits/chosen": -12.4375, + "logits/rejected": -12.375, + "logps/chosen": -520.0, + "logps/rejected": -640.0, + "loss": 0.1884, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -11.0625, + "rewards/margins": 7.5625, + "rewards/rejected": -18.625, + "step": 3610 + }, + { + "epoch": 1.8942961800104658, + "grad_norm": 1.5962079455355056, + "learning_rate": 1.7993212157606172e-05, + "logits/chosen": -12.1875, + "logits/rejected": -12.0625, + "logps/chosen": -516.0, + "logps/rejected": -624.0, + "loss": 0.1699, + "rewards/accuracies": 0.9375, + "rewards/chosen": -10.75, + "rewards/margins": 6.25, + "rewards/rejected": -17.0, + "step": 3620 + }, + { + "epoch": 1.8995290423861853, + "grad_norm": 4.621607845404804, + "learning_rate": 1.784720612526148e-05, + "logits/chosen": -12.875, + "logits/rejected": -12.875, + "logps/chosen": -540.0, + "logps/rejected": -664.0, + "loss": 0.1533, + "rewards/accuracies": 0.9375, + "rewards/chosen": -11.9375, + "rewards/margins": 6.3125, + "rewards/rejected": -18.25, + "step": 3630 + }, + { + "epoch": 1.9047619047619047, + "grad_norm": 5.216061893758242, + "learning_rate": 1.770146533538315e-05, + "logits/chosen": -12.75, + "logits/rejected": -12.875, + "logps/chosen": -544.0, + "logps/rejected": -608.0, + "loss": 0.1264, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -11.3125, + "rewards/margins": 6.875, + "rewards/rejected": -18.25, + "step": 3640 + }, + { + "epoch": 1.9099947671376243, + "grad_norm": 1.709150410480554, + "learning_rate": 1.755599519238338e-05, + "logits/chosen": -12.75, + "logits/rejected": -12.375, + "logps/chosen": -500.0, + "logps/rejected": -712.0, + "loss": 0.1324, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -10.9375, + "rewards/margins": 8.3125, + "rewards/rejected": -19.25, + "step": 3650 + }, + { + "epoch": 1.915227629513344, + "grad_norm": 5.0117582881977425, + "learning_rate": 1.741080109063817e-05, + "logits/chosen": -12.9375, + "logits/rejected": -12.625, + "logps/chosen": -544.0, + "logps/rejected": -644.0, + "loss": 0.1836, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -12.1875, + "rewards/margins": 6.25, + "rewards/rejected": -18.5, + "step": 3660 + }, + { + "epoch": 1.9204604918890633, + "grad_norm": 3.8566029873849295, + "learning_rate": 1.7265888414287247e-05, + "logits/chosen": -12.4375, + "logits/rejected": -12.875, + "logps/chosen": -580.0, + "logps/rejected": -652.0, + "loss": 0.157, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -11.25, + "rewards/margins": 6.9375, + "rewards/rejected": -18.25, + "step": 3670 + }, + { + "epoch": 1.9256933542647827, + "grad_norm": 5.667257724840644, + "learning_rate": 1.7121262537034397e-05, + "logits/chosen": -12.875, + "logits/rejected": -12.6875, + "logps/chosen": -604.0, + "logps/rejected": -688.0, + "loss": 0.1454, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -11.75, + "rewards/margins": 7.34375, + "rewards/rejected": -19.125, + "step": 3680 + }, + { + "epoch": 1.9309262166405023, + "grad_norm": 3.0310912811799913, + "learning_rate": 1.6976928821948263e-05, + "logits/chosen": -13.0625, + "logits/rejected": -13.0625, + "logps/chosen": -504.0, + "logps/rejected": -620.0, + "loss": 0.1329, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -10.9375, + "rewards/margins": 6.5625, + "rewards/rejected": -17.5, + "step": 3690 + }, + { + "epoch": 1.936159079016222, + "grad_norm": 5.731239526582649, + "learning_rate": 1.6832892621263407e-05, + "logits/chosen": -12.6875, + "logits/rejected": -12.875, + "logps/chosen": -588.0, + "logps/rejected": -708.0, + "loss": 0.1494, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -11.1875, + "rewards/margins": 8.8125, + "rewards/rejected": -20.0, + "step": 3700 + }, + { + "epoch": 1.9413919413919414, + "grad_norm": 4.90791941322355, + "learning_rate": 1.6689159276181832e-05, + "logits/chosen": -13.25, + "logits/rejected": -13.1875, + "logps/chosen": -516.0, + "logps/rejected": -628.0, + "loss": 0.1598, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -11.1875, + "rewards/margins": 6.84375, + "rewards/rejected": -18.0, + "step": 3710 + }, + { + "epoch": 1.9466248037676608, + "grad_norm": 5.067966638143667, + "learning_rate": 1.6545734116674966e-05, + "logits/chosen": -12.25, + "logits/rejected": -12.625, + "logps/chosen": -556.0, + "logps/rejected": -612.0, + "loss": 0.1497, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -10.8125, + "rewards/margins": 6.84375, + "rewards/rejected": -17.75, + "step": 3720 + }, + { + "epoch": 1.9518576661433804, + "grad_norm": 1.0617423449341068, + "learning_rate": 1.6402622461286003e-05, + "logits/chosen": -12.8125, + "logits/rejected": -13.25, + "logps/chosen": -588.0, + "logps/rejected": -656.0, + "loss": 0.1265, + "rewards/accuracies": 0.9375, + "rewards/chosen": -10.9375, + "rewards/margins": 7.25, + "rewards/rejected": -18.125, + "step": 3730 + }, + { + "epoch": 1.9570905285191, + "grad_norm": 4.244712168957068, + "learning_rate": 1.625982961693262e-05, + "logits/chosen": -12.625, + "logits/rejected": -12.75, + "logps/chosen": -584.0, + "logps/rejected": -688.0, + "loss": 0.1025, + "rewards/accuracies": 0.9375, + "rewards/chosen": -10.125, + "rewards/margins": 8.6875, + "rewards/rejected": -18.75, + "step": 3740 + }, + { + "epoch": 1.9623233908948194, + "grad_norm": 6.34435554599616, + "learning_rate": 1.6117360878710265e-05, + "logits/chosen": -12.9375, + "logits/rejected": -12.875, + "logps/chosen": -536.0, + "logps/rejected": -640.0, + "loss": 0.1722, + "rewards/accuracies": 0.9375, + "rewards/chosen": -10.5, + "rewards/margins": 7.125, + "rewards/rejected": -17.625, + "step": 3750 + }, + { + "epoch": 1.9675562532705388, + "grad_norm": 4.415617650295368, + "learning_rate": 1.5975221529695774e-05, + "logits/chosen": -12.125, + "logits/rejected": -12.5, + "logps/chosen": -556.0, + "logps/rejected": -628.0, + "loss": 0.1358, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -11.0, + "rewards/margins": 7.3125, + "rewards/rejected": -18.25, + "step": 3760 + }, + { + "epoch": 1.9727891156462585, + "grad_norm": 3.059644737728734, + "learning_rate": 1.583341684075141e-05, + "logits/chosen": -12.6875, + "logits/rejected": -12.9375, + "logps/chosen": -510.0, + "logps/rejected": -612.0, + "loss": 0.1268, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -10.875, + "rewards/margins": 6.71875, + "rewards/rejected": -17.5, + "step": 3770 + }, + { + "epoch": 1.978021978021978, + "grad_norm": 5.247635738453357, + "learning_rate": 1.5691952070329495e-05, + "logits/chosen": -12.5, + "logits/rejected": -12.6875, + "logps/chosen": -576.0, + "logps/rejected": -708.0, + "loss": 0.1451, + "rewards/accuracies": 0.9375, + "rewards/chosen": -10.6875, + "rewards/margins": 8.875, + "rewards/rejected": -19.625, + "step": 3780 + }, + { + "epoch": 1.9832548403976975, + "grad_norm": 7.712781298345494, + "learning_rate": 1.555083246427734e-05, + "logits/chosen": -12.5, + "logits/rejected": -12.625, + "logps/chosen": -600.0, + "logps/rejected": -704.0, + "loss": 0.0989, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -11.125, + "rewards/margins": 7.875, + "rewards/rejected": -19.0, + "step": 3790 + }, + { + "epoch": 1.988487702773417, + "grad_norm": 1.7203767588512628, + "learning_rate": 1.541006325564277e-05, + "logits/chosen": -12.5, + "logits/rejected": -12.8125, + "logps/chosen": -560.0, + "logps/rejected": -680.0, + "loss": 0.1498, + "rewards/accuracies": 0.875, + "rewards/chosen": -11.8125, + "rewards/margins": 7.5, + "rewards/rejected": -19.375, + "step": 3800 + }, + { + "epoch": 1.9937205651491365, + "grad_norm": 0.9271784942387864, + "learning_rate": 1.5269649664480038e-05, + "logits/chosen": -12.9375, + "logits/rejected": -13.125, + "logps/chosen": -580.0, + "logps/rejected": -680.0, + "loss": 0.1312, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -11.0625, + "rewards/margins": 8.25, + "rewards/rejected": -19.25, + "step": 3810 + }, + { + "epoch": 1.9989534275248562, + "grad_norm": 1.754148795500601, + "learning_rate": 1.5129596897656257e-05, + "logits/chosen": -13.1875, + "logits/rejected": -13.0625, + "logps/chosen": -560.0, + "logps/rejected": -660.0, + "loss": 0.0999, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -11.25, + "rewards/margins": 7.9375, + "rewards/rejected": -19.25, + "step": 3820 + }, + { + "epoch": 2.0, + "eval_logits/chosen": -13.0, + "eval_logits/rejected": -13.0625, + "eval_logps/chosen": -604.0, + "eval_logps/rejected": -608.0, + "eval_loss": 0.8018984198570251, + "eval_rewards/accuracies": 0.69921875, + "eval_rewards/chosen": -14.125, + "eval_rewards/margins": 1.8125, + "eval_rewards/rejected": -15.9375, + "eval_runtime": 46.7492, + "eval_samples_per_second": 42.781, + "eval_steps_per_second": 0.685, + "step": 3822 + }, + { + "epoch": 2.004186289900576, + "grad_norm": 0.44069773718681493, + "learning_rate": 1.4989910148658325e-05, + "logits/chosen": -12.8125, + "logits/rejected": -12.375, + "logps/chosen": -584.0, + "logps/rejected": -720.0, + "loss": 0.0238, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.3125, + "rewards/margins": 9.375, + "rewards/rejected": -20.625, + "step": 3830 + }, + { + "epoch": 2.009419152276295, + "grad_norm": 0.5769119803775666, + "learning_rate": 1.4850594597400352e-05, + "logits/chosen": -12.75, + "logits/rejected": -12.5625, + "logps/chosen": -592.0, + "logps/rejected": -744.0, + "loss": 0.0116, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -10.625, + "rewards/margins": 11.9375, + "rewards/rejected": -22.5, + "step": 3840 + }, + { + "epoch": 2.0146520146520146, + "grad_norm": 0.5107802441858381, + "learning_rate": 1.4711655410031538e-05, + "logits/chosen": -12.8125, + "logits/rejected": -12.5, + "logps/chosen": -532.0, + "logps/rejected": -684.0, + "loss": 0.0158, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.625, + "rewards/margins": 10.1875, + "rewards/rejected": -20.875, + "step": 3850 + }, + { + "epoch": 2.0198848770277342, + "grad_norm": 0.5560067424928957, + "learning_rate": 1.4573097738744623e-05, + "logits/chosen": -13.0, + "logits/rejected": -12.3125, + "logps/chosen": -498.0, + "logps/rejected": -700.0, + "loss": 0.0208, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -10.6875, + "rewards/margins": 10.4375, + "rewards/rejected": -21.125, + "step": 3860 + }, + { + "epoch": 2.025117739403454, + "grad_norm": 0.691304079384525, + "learning_rate": 1.4434926721584865e-05, + "logits/chosen": -12.9375, + "logits/rejected": -12.875, + "logps/chosen": -532.0, + "logps/rejected": -704.0, + "loss": 0.0175, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.3125, + "rewards/margins": 10.25, + "rewards/rejected": -20.625, + "step": 3870 + }, + { + "epoch": 2.030350601779173, + "grad_norm": 4.441051778595759, + "learning_rate": 1.4297147482259424e-05, + "logits/chosen": -13.25, + "logits/rejected": -12.5625, + "logps/chosen": -512.0, + "logps/rejected": -684.0, + "loss": 0.0155, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -10.3125, + "rewards/margins": 11.0, + "rewards/rejected": -21.375, + "step": 3880 + }, + { + "epoch": 2.0355834641548927, + "grad_norm": 0.2629076990932687, + "learning_rate": 1.4159765129947445e-05, + "logits/chosen": -12.625, + "logits/rejected": -12.25, + "logps/chosen": -552.0, + "logps/rejected": -660.0, + "loss": 0.0257, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.3125, + "rewards/margins": 10.375, + "rewards/rejected": -20.75, + "step": 3890 + }, + { + "epoch": 2.0408163265306123, + "grad_norm": 2.621446854010498, + "learning_rate": 1.4022784759110577e-05, + "logits/chosen": -13.125, + "logits/rejected": -12.6875, + "logps/chosen": -484.0, + "logps/rejected": -688.0, + "loss": 0.0371, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -10.3125, + "rewards/margins": 10.25, + "rewards/rejected": -20.5, + "step": 3900 + }, + { + "epoch": 2.046049188906332, + "grad_norm": 0.3214305037411537, + "learning_rate": 1.3886211449304005e-05, + "logits/chosen": -12.6875, + "logits/rejected": -12.375, + "logps/chosen": -508.0, + "logps/rejected": -704.0, + "loss": 0.0139, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.125, + "rewards/margins": 9.375, + "rewards/rejected": -19.5, + "step": 3910 + }, + { + "epoch": 2.051282051282051, + "grad_norm": 0.26615797826258786, + "learning_rate": 1.3750050264988173e-05, + "logits/chosen": -13.125, + "logits/rejected": -12.6875, + "logps/chosen": -472.0, + "logps/rejected": -676.0, + "loss": 0.0181, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.875, + "rewards/margins": 10.5625, + "rewards/rejected": -20.375, + "step": 3920 + }, + { + "epoch": 2.0565149136577707, + "grad_norm": 0.1629150297796315, + "learning_rate": 1.361430625534092e-05, + "logits/chosen": -13.25, + "logits/rejected": -13.125, + "logps/chosen": -560.0, + "logps/rejected": -720.0, + "loss": 0.014, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -10.125, + "rewards/margins": 12.4375, + "rewards/rejected": -22.5, + "step": 3930 + }, + { + "epoch": 2.0617477760334904, + "grad_norm": 0.4338741888563191, + "learning_rate": 1.3478984454070274e-05, + "logits/chosen": -13.3125, + "logits/rejected": -12.8125, + "logps/chosen": -492.0, + "logps/rejected": -752.0, + "loss": 0.0122, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -9.125, + "rewards/margins": 13.0, + "rewards/rejected": -22.125, + "step": 3940 + }, + { + "epoch": 2.06698063840921, + "grad_norm": 0.6659403344792019, + "learning_rate": 1.334408987922777e-05, + "logits/chosen": -13.3125, + "logits/rejected": -12.5, + "logps/chosen": -544.0, + "logps/rejected": -728.0, + "loss": 0.0203, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.375, + "rewards/margins": 11.8125, + "rewards/rejected": -22.125, + "step": 3950 + }, + { + "epoch": 2.072213500784929, + "grad_norm": 0.2383534684121332, + "learning_rate": 1.3209627533022396e-05, + "logits/chosen": -12.875, + "logits/rejected": -12.25, + "logps/chosen": -532.0, + "logps/rejected": -744.0, + "loss": 0.0093, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -10.375, + "rewards/margins": 12.25, + "rewards/rejected": -22.75, + "step": 3960 + }, + { + "epoch": 2.077446363160649, + "grad_norm": 0.17771246387777764, + "learning_rate": 1.3075602401635056e-05, + "logits/chosen": -13.1875, + "logits/rejected": -12.5, + "logps/chosen": -520.0, + "logps/rejected": -664.0, + "loss": 0.0137, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -11.0625, + "rewards/margins": 10.25, + "rewards/rejected": -21.25, + "step": 3970 + }, + { + "epoch": 2.0826792255363684, + "grad_norm": 0.329137920392516, + "learning_rate": 1.2942019455033715e-05, + "logits/chosen": -12.8125, + "logits/rejected": -12.125, + "logps/chosen": -580.0, + "logps/rejected": -800.0, + "loss": 0.0065, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -10.0625, + "rewards/margins": 13.125, + "rewards/rejected": -23.25, + "step": 3980 + }, + { + "epoch": 2.087912087912088, + "grad_norm": 0.13778616370169536, + "learning_rate": 1.2808883646789089e-05, + "logits/chosen": -13.5, + "logits/rejected": -12.875, + "logps/chosen": -496.0, + "logps/rejected": -688.0, + "loss": 0.0092, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.375, + "rewards/margins": 11.3125, + "rewards/rejected": -21.75, + "step": 3990 + }, + { + "epoch": 2.0931449502878072, + "grad_norm": 0.24055482258031696, + "learning_rate": 1.2676199913890935e-05, + "logits/chosen": -13.125, + "logits/rejected": -12.75, + "logps/chosen": -524.0, + "logps/rejected": -736.0, + "loss": 0.009, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.9375, + "rewards/margins": 12.875, + "rewards/rejected": -22.75, + "step": 4000 + }, + { + "epoch": 2.098377812663527, + "grad_norm": 0.12739837504095347, + "learning_rate": 1.2543973176565014e-05, + "logits/chosen": -13.0, + "logits/rejected": -12.1875, + "logps/chosen": -520.0, + "logps/rejected": -756.0, + "loss": 0.0184, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.8125, + "rewards/margins": 13.125, + "rewards/rejected": -24.0, + "step": 4010 + }, + { + "epoch": 2.1036106750392465, + "grad_norm": 0.11079988686594298, + "learning_rate": 1.2412208338090566e-05, + "logits/chosen": -13.125, + "logits/rejected": -12.5, + "logps/chosen": -596.0, + "logps/rejected": -828.0, + "loss": 0.0115, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.25, + "rewards/margins": 13.75, + "rewards/rejected": -24.0, + "step": 4020 + }, + { + "epoch": 2.108843537414966, + "grad_norm": 0.9326656347824286, + "learning_rate": 1.2280910284618583e-05, + "logits/chosen": -13.0, + "logits/rejected": -12.5, + "logps/chosen": -528.0, + "logps/rejected": -752.0, + "loss": 0.0135, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.25, + "rewards/margins": 13.5, + "rewards/rejected": -23.75, + "step": 4030 + }, + { + "epoch": 2.1140763997906853, + "grad_norm": 1.5759110453700604, + "learning_rate": 1.2150083884990538e-05, + "logits/chosen": -13.25, + "logits/rejected": -12.375, + "logps/chosen": -548.0, + "logps/rejected": -764.0, + "loss": 0.0156, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.25, + "rewards/margins": 11.75, + "rewards/rejected": -23.0, + "step": 4040 + }, + { + "epoch": 2.119309262166405, + "grad_norm": 1.1917503318443903, + "learning_rate": 1.201973399055788e-05, + "logits/chosen": -13.125, + "logits/rejected": -12.75, + "logps/chosen": -548.0, + "logps/rejected": -724.0, + "loss": 0.014, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.625, + "rewards/margins": 12.125, + "rewards/rejected": -21.75, + "step": 4050 + }, + { + "epoch": 2.1245421245421245, + "grad_norm": 3.1551012011885393, + "learning_rate": 1.1889865435002117e-05, + "logits/chosen": -13.5, + "logits/rejected": -12.75, + "logps/chosen": -528.0, + "logps/rejected": -724.0, + "loss": 0.0172, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.5625, + "rewards/margins": 11.75, + "rewards/rejected": -22.375, + "step": 4060 + }, + { + "epoch": 2.129774986917844, + "grad_norm": 1.2341246483692032, + "learning_rate": 1.176048303415559e-05, + "logits/chosen": -12.625, + "logits/rejected": -12.1875, + "logps/chosen": -496.0, + "logps/rejected": -728.0, + "loss": 0.0164, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -10.375, + "rewards/margins": 12.1875, + "rewards/rejected": -22.5, + "step": 4070 + }, + { + "epoch": 2.1350078492935634, + "grad_norm": 0.12092505668455128, + "learning_rate": 1.1631591585822841e-05, + "logits/chosen": -13.1875, + "logits/rejected": -12.75, + "logps/chosen": -516.0, + "logps/rejected": -760.0, + "loss": 0.0248, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -10.0, + "rewards/margins": 12.75, + "rewards/rejected": -22.75, + "step": 4080 + }, + { + "epoch": 2.140240711669283, + "grad_norm": 0.9346832597811591, + "learning_rate": 1.1503195869602767e-05, + "logits/chosen": -13.0625, + "logits/rejected": -12.8125, + "logps/chosen": -552.0, + "logps/rejected": -728.0, + "loss": 0.0156, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -11.25, + "rewards/margins": 11.3125, + "rewards/rejected": -22.625, + "step": 4090 + }, + { + "epoch": 2.1454735740450026, + "grad_norm": 0.26958900501293714, + "learning_rate": 1.137530064671135e-05, + "logits/chosen": -12.875, + "logits/rejected": -12.25, + "logps/chosen": -536.0, + "logps/rejected": -748.0, + "loss": 0.0172, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -10.75, + "rewards/margins": 11.9375, + "rewards/rejected": -22.75, + "step": 4100 + }, + { + "epoch": 2.1507064364207222, + "grad_norm": 1.2068543614594405, + "learning_rate": 1.1247910659805064e-05, + "logits/chosen": -12.875, + "logits/rejected": -12.4375, + "logps/chosen": -532.0, + "logps/rejected": -672.0, + "loss": 0.0114, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.4375, + "rewards/margins": 12.0625, + "rewards/rejected": -21.5, + "step": 4110 + }, + { + "epoch": 2.155939298796442, + "grad_norm": 0.07230196530823124, + "learning_rate": 1.112103063280509e-05, + "logits/chosen": -12.9375, + "logits/rejected": -12.375, + "logps/chosen": -548.0, + "logps/rejected": -804.0, + "loss": 0.0082, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -9.5625, + "rewards/margins": 13.125, + "rewards/rejected": -22.75, + "step": 4120 + }, + { + "epoch": 2.161172161172161, + "grad_norm": 0.9441548421966137, + "learning_rate": 1.0994665270722071e-05, + "logits/chosen": -13.1875, + "logits/rejected": -12.6875, + "logps/chosen": -484.0, + "logps/rejected": -756.0, + "loss": 0.0116, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.8125, + "rewards/margins": 12.5, + "rewards/rejected": -22.375, + "step": 4130 + }, + { + "epoch": 2.1664050235478807, + "grad_norm": 1.6049171624147622, + "learning_rate": 1.0868819259481639e-05, + "logits/chosen": -12.875, + "logits/rejected": -12.5625, + "logps/chosen": -560.0, + "logps/rejected": -720.0, + "loss": 0.0101, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.5, + "rewards/margins": 11.0, + "rewards/rejected": -22.5, + "step": 4140 + }, + { + "epoch": 2.1716378859236003, + "grad_norm": 0.05234835972167443, + "learning_rate": 1.0743497265750702e-05, + "logits/chosen": -13.25, + "logits/rejected": -12.375, + "logps/chosen": -544.0, + "logps/rejected": -724.0, + "loss": 0.0189, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -10.875, + "rewards/margins": 11.5625, + "rewards/rejected": -22.375, + "step": 4150 + }, + { + "epoch": 2.17687074829932, + "grad_norm": 0.1969413294246078, + "learning_rate": 1.061870393676436e-05, + "logits/chosen": -12.9375, + "logits/rejected": -12.75, + "logps/chosen": -556.0, + "logps/rejected": -804.0, + "loss": 0.0207, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -9.75, + "rewards/margins": 13.5625, + "rewards/rejected": -23.25, + "step": 4160 + }, + { + "epoch": 2.182103610675039, + "grad_norm": 0.6075595881302668, + "learning_rate": 1.0494443900153558e-05, + "logits/chosen": -13.0, + "logits/rejected": -12.75, + "logps/chosen": -576.0, + "logps/rejected": -752.0, + "loss": 0.0286, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.625, + "rewards/margins": 11.625, + "rewards/rejected": -22.25, + "step": 4170 + }, + { + "epoch": 2.1873364730507587, + "grad_norm": 0.06443679481327427, + "learning_rate": 1.0370721763773508e-05, + "logits/chosen": -13.25, + "logits/rejected": -12.75, + "logps/chosen": -548.0, + "logps/rejected": -752.0, + "loss": 0.0144, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.25, + "rewards/margins": 12.125, + "rewards/rejected": -22.375, + "step": 4180 + }, + { + "epoch": 2.1925693354264784, + "grad_norm": 1.122736909133277, + "learning_rate": 1.0247542115532846e-05, + "logits/chosen": -13.0, + "logits/rejected": -12.75, + "logps/chosen": -510.0, + "logps/rejected": -728.0, + "loss": 0.0396, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -9.625, + "rewards/margins": 12.625, + "rewards/rejected": -22.25, + "step": 4190 + }, + { + "epoch": 2.197802197802198, + "grad_norm": 0.6888171252410781, + "learning_rate": 1.0124909523223419e-05, + "logits/chosen": -13.0, + "logits/rejected": -12.5, + "logps/chosen": -564.0, + "logps/rejected": -760.0, + "loss": 0.0267, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.875, + "rewards/margins": 12.625, + "rewards/rejected": -22.5, + "step": 4200 + }, + { + "epoch": 2.203035060177917, + "grad_norm": 0.19411528574032383, + "learning_rate": 1.0002828534350989e-05, + "logits/chosen": -12.6875, + "logits/rejected": -12.375, + "logps/chosen": -552.0, + "logps/rejected": -724.0, + "loss": 0.0172, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -9.875, + "rewards/margins": 11.75, + "rewards/rejected": -21.625, + "step": 4210 + }, + { + "epoch": 2.208267922553637, + "grad_norm": 0.18615879946631778, + "learning_rate": 9.881303675966525e-06, + "logits/chosen": -12.875, + "logits/rejected": -12.375, + "logps/chosen": -544.0, + "logps/rejected": -736.0, + "loss": 0.0052, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -10.5625, + "rewards/margins": 12.1875, + "rewards/rejected": -22.75, + "step": 4220 + }, + { + "epoch": 2.2135007849293564, + "grad_norm": 0.06276505039432473, + "learning_rate": 9.760339454498393e-06, + "logits/chosen": -13.0625, + "logits/rejected": -12.3125, + "logps/chosen": -498.0, + "logps/rejected": -720.0, + "loss": 0.0159, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.25, + "rewards/margins": 12.0625, + "rewards/rejected": -22.375, + "step": 4230 + }, + { + "epoch": 2.218733647305076, + "grad_norm": 0.2906987272091601, + "learning_rate": 9.639940355585219e-06, + "logits/chosen": -13.0, + "logits/rejected": -12.625, + "logps/chosen": -584.0, + "logps/rejected": -760.0, + "loss": 0.0286, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -11.125, + "rewards/margins": 11.3125, + "rewards/rejected": -22.375, + "step": 4240 + }, + { + "epoch": 2.2239665096807952, + "grad_norm": 1.8677189187726966, + "learning_rate": 9.520110843909542e-06, + "logits/chosen": -12.8125, + "logits/rejected": -12.3125, + "logps/chosen": -524.0, + "logps/rejected": -700.0, + "loss": 0.0168, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -10.0, + "rewards/margins": 10.6875, + "rewards/rejected": -20.625, + "step": 4250 + }, + { + "epoch": 2.229199372056515, + "grad_norm": 0.5031535847376246, + "learning_rate": 9.400855363032262e-06, + "logits/chosen": -12.75, + "logits/rejected": -12.5, + "logps/chosen": -544.0, + "logps/rejected": -764.0, + "loss": 0.0108, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -9.75, + "rewards/margins": 12.8125, + "rewards/rejected": -22.5, + "step": 4260 + }, + { + "epoch": 2.2344322344322345, + "grad_norm": 0.3299828837281531, + "learning_rate": 9.282178335227884e-06, + "logits/chosen": -12.625, + "logits/rejected": -12.1875, + "logps/chosen": -502.0, + "logps/rejected": -772.0, + "loss": 0.0247, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -9.75, + "rewards/margins": 13.0, + "rewards/rejected": -22.75, + "step": 4270 + }, + { + "epoch": 2.239665096807954, + "grad_norm": 0.8321963118058464, + "learning_rate": 9.164084161320471e-06, + "logits/chosen": -12.625, + "logits/rejected": -12.625, + "logps/chosen": -506.0, + "logps/rejected": -740.0, + "loss": 0.017, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -9.125, + "rewards/margins": 13.625, + "rewards/rejected": -22.75, + "step": 4280 + }, + { + "epoch": 2.2448979591836733, + "grad_norm": 0.6943875876177056, + "learning_rate": 9.04657722052052e-06, + "logits/chosen": -12.75, + "logits/rejected": -12.8125, + "logps/chosen": -556.0, + "logps/rejected": -764.0, + "loss": 0.0127, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.5, + "rewards/margins": 13.25, + "rewards/rejected": -22.75, + "step": 4290 + }, + { + "epoch": 2.250130821559393, + "grad_norm": 6.799804412241437, + "learning_rate": 8.929661870262526e-06, + "logits/chosen": -12.3125, + "logits/rejected": -12.0, + "logps/chosen": -616.0, + "logps/rejected": -816.0, + "loss": 0.0137, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.4375, + "rewards/margins": 13.0625, + "rewards/rejected": -23.5, + "step": 4300 + }, + { + "epoch": 2.2553636839351126, + "grad_norm": 0.19463379281176907, + "learning_rate": 8.813342446043424e-06, + "logits/chosen": -12.625, + "logits/rejected": -12.375, + "logps/chosen": -512.0, + "logps/rejected": -752.0, + "loss": 0.0206, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -9.6875, + "rewards/margins": 12.0625, + "rewards/rejected": -21.75, + "step": 4310 + }, + { + "epoch": 2.260596546310832, + "grad_norm": 0.2697052020128489, + "learning_rate": 8.697623261261789e-06, + "logits/chosen": -12.75, + "logits/rejected": -12.3125, + "logps/chosen": -528.0, + "logps/rejected": -764.0, + "loss": 0.027, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.125, + "rewards/margins": 12.8125, + "rewards/rejected": -22.875, + "step": 4320 + }, + { + "epoch": 2.2658294086865514, + "grad_norm": 0.40883057591483557, + "learning_rate": 8.58250860705792e-06, + "logits/chosen": -12.5, + "logits/rejected": -12.75, + "logps/chosen": -588.0, + "logps/rejected": -700.0, + "loss": 0.0161, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -10.75, + "rewards/margins": 10.25, + "rewards/rejected": -21.0, + "step": 4330 + }, + { + "epoch": 2.271062271062271, + "grad_norm": 0.1423644882664783, + "learning_rate": 8.468002752154672e-06, + "logits/chosen": -12.375, + "logits/rejected": -12.4375, + "logps/chosen": -540.0, + "logps/rejected": -716.0, + "loss": 0.0173, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -10.0, + "rewards/margins": 11.8125, + "rewards/rejected": -21.75, + "step": 4340 + }, + { + "epoch": 2.2762951334379906, + "grad_norm": 0.2527420977259, + "learning_rate": 8.35410994269921e-06, + "logits/chosen": -12.5625, + "logits/rejected": -12.5625, + "logps/chosen": -612.0, + "logps/rejected": -756.0, + "loss": 0.0122, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.5625, + "rewards/margins": 11.8125, + "rewards/rejected": -22.375, + "step": 4350 + }, + { + "epoch": 2.2815279958137102, + "grad_norm": 0.9800221783762282, + "learning_rate": 8.240834402105524e-06, + "logits/chosen": -12.5625, + "logits/rejected": -12.25, + "logps/chosen": -532.0, + "logps/rejected": -736.0, + "loss": 0.0144, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -9.375, + "rewards/margins": 11.875, + "rewards/rejected": -21.25, + "step": 4360 + }, + { + "epoch": 2.2867608581894294, + "grad_norm": 0.5775266185707517, + "learning_rate": 8.128180330897791e-06, + "logits/chosen": -12.5625, + "logits/rejected": -12.1875, + "logps/chosen": -498.0, + "logps/rejected": -752.0, + "loss": 0.0138, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -8.6875, + "rewards/margins": 13.0625, + "rewards/rejected": -21.75, + "step": 4370 + }, + { + "epoch": 2.291993720565149, + "grad_norm": 0.10327636037785584, + "learning_rate": 8.016151906554683e-06, + "logits/chosen": -12.9375, + "logits/rejected": -12.4375, + "logps/chosen": -516.0, + "logps/rejected": -772.0, + "loss": 0.0129, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -10.125, + "rewards/margins": 12.0625, + "rewards/rejected": -22.125, + "step": 4380 + }, + { + "epoch": 2.2972265829408687, + "grad_norm": 0.1282213740055652, + "learning_rate": 7.90475328335439e-06, + "logits/chosen": -12.6875, + "logits/rejected": -12.6875, + "logps/chosen": -548.0, + "logps/rejected": -700.0, + "loss": 0.015, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -9.5, + "rewards/margins": 12.0625, + "rewards/rejected": -21.625, + "step": 4390 + }, + { + "epoch": 2.3024594453165883, + "grad_norm": 0.05266887192828813, + "learning_rate": 7.793988592220569e-06, + "logits/chosen": -12.8125, + "logits/rejected": -12.25, + "logps/chosen": -488.0, + "logps/rejected": -640.0, + "loss": 0.018, + "rewards/accuracies": 0.9375, + "rewards/chosen": -10.0, + "rewards/margins": 10.125, + "rewards/rejected": -20.125, + "step": 4400 + }, + { + "epoch": 2.3076923076923075, + "grad_norm": 0.04530104525562387, + "learning_rate": 7.683861940569218e-06, + "logits/chosen": -12.5625, + "logits/rejected": -12.1875, + "logps/chosen": -584.0, + "logps/rejected": -728.0, + "loss": 0.0106, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.375, + "rewards/margins": 10.875, + "rewards/rejected": -22.25, + "step": 4410 + }, + { + "epoch": 2.312925170068027, + "grad_norm": 0.20466612976766344, + "learning_rate": 7.574377412156292e-06, + "logits/chosen": -12.8125, + "logits/rejected": -12.375, + "logps/chosen": -494.0, + "logps/rejected": -664.0, + "loss": 0.0206, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -9.9375, + "rewards/margins": 11.0625, + "rewards/rejected": -21.0, + "step": 4420 + }, + { + "epoch": 2.3181580324437467, + "grad_norm": 1.081779518073677, + "learning_rate": 7.465539066926322e-06, + "logits/chosen": -12.8125, + "logits/rejected": -12.6875, + "logps/chosen": -520.0, + "logps/rejected": -712.0, + "loss": 0.017, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -9.4375, + "rewards/margins": 12.375, + "rewards/rejected": -21.75, + "step": 4430 + }, + { + "epoch": 2.3233908948194664, + "grad_norm": 0.24020374605900488, + "learning_rate": 7.357350940861845e-06, + "logits/chosen": -12.5625, + "logits/rejected": -12.375, + "logps/chosen": -576.0, + "logps/rejected": -808.0, + "loss": 0.0057, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -9.625, + "rewards/margins": 12.9375, + "rewards/rejected": -22.625, + "step": 4440 + }, + { + "epoch": 2.328623757195186, + "grad_norm": 0.21378756606796173, + "learning_rate": 7.249817045833726e-06, + "logits/chosen": -12.875, + "logits/rejected": -12.9375, + "logps/chosen": -528.0, + "logps/rejected": -728.0, + "loss": 0.014, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -9.3125, + "rewards/margins": 13.0, + "rewards/rejected": -22.375, + "step": 4450 + }, + { + "epoch": 2.333856619570905, + "grad_norm": 0.06488277056020401, + "learning_rate": 7.142941369452411e-06, + "logits/chosen": -12.625, + "logits/rejected": -12.5, + "logps/chosen": -512.0, + "logps/rejected": -712.0, + "loss": 0.0154, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -10.125, + "rewards/margins": 10.875, + "rewards/rejected": -21.0, + "step": 4460 + }, + { + "epoch": 2.339089481946625, + "grad_norm": 0.7536243870528055, + "learning_rate": 7.036727874920043e-06, + "logits/chosen": -12.75, + "logits/rejected": -12.0625, + "logps/chosen": -508.0, + "logps/rejected": -764.0, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.3125, + "rewards/margins": 13.5625, + "rewards/rejected": -22.875, + "step": 4470 + }, + { + "epoch": 2.3443223443223444, + "grad_norm": 0.1250439583457857, + "learning_rate": 6.931180500883486e-06, + "logits/chosen": -12.8125, + "logits/rejected": -12.3125, + "logps/chosen": -470.0, + "logps/rejected": -676.0, + "loss": 0.0086, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -10.0, + "rewards/margins": 11.0625, + "rewards/rejected": -21.125, + "step": 4480 + }, + { + "epoch": 2.3495552066980636, + "grad_norm": 0.0633273759969605, + "learning_rate": 6.826303161288303e-06, + "logits/chosen": -12.5, + "logits/rejected": -12.25, + "logps/chosen": -500.0, + "logps/rejected": -728.0, + "loss": 0.0123, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -9.8125, + "rewards/margins": 13.0, + "rewards/rejected": -22.875, + "step": 4490 + }, + { + "epoch": 2.3547880690737832, + "grad_norm": 0.07953697544602377, + "learning_rate": 6.722099745233595e-06, + "logits/chosen": -12.5, + "logits/rejected": -12.1875, + "logps/chosen": -596.0, + "logps/rejected": -756.0, + "loss": 0.0123, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.0, + "rewards/margins": 12.125, + "rewards/rejected": -23.125, + "step": 4500 + }, + { + "epoch": 2.360020931449503, + "grad_norm": 0.100823164008195, + "learning_rate": 6.618574116827786e-06, + "logits/chosen": -12.75, + "logits/rejected": -12.375, + "logps/chosen": -520.0, + "logps/rejected": -764.0, + "loss": 0.0122, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.6875, + "rewards/margins": 13.1875, + "rewards/rejected": -22.875, + "step": 4510 + }, + { + "epoch": 2.3652537938252225, + "grad_norm": 0.04037595387640658, + "learning_rate": 6.51573011504534e-06, + "logits/chosen": -12.5, + "logits/rejected": -12.3125, + "logps/chosen": -640.0, + "logps/rejected": -848.0, + "loss": 0.0137, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.1875, + "rewards/margins": 13.6875, + "rewards/rejected": -24.875, + "step": 4520 + }, + { + "epoch": 2.370486656200942, + "grad_norm": 0.041466180476313466, + "learning_rate": 6.4135715535844e-06, + "logits/chosen": -12.875, + "logits/rejected": -12.8125, + "logps/chosen": -528.0, + "logps/rejected": -724.0, + "loss": 0.0121, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.5625, + "rewards/margins": 13.0, + "rewards/rejected": -22.625, + "step": 4530 + }, + { + "epoch": 2.3757195185766613, + "grad_norm": 2.2267009793422337, + "learning_rate": 6.312102220725347e-06, + "logits/chosen": -12.625, + "logits/rejected": -12.375, + "logps/chosen": -568.0, + "logps/rejected": -792.0, + "loss": 0.0161, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -10.25, + "rewards/margins": 13.875, + "rewards/rejected": -24.125, + "step": 4540 + }, + { + "epoch": 2.380952380952381, + "grad_norm": 3.6750631389711232, + "learning_rate": 6.21132587919036e-06, + "logits/chosen": -12.5625, + "logits/rejected": -12.375, + "logps/chosen": -548.0, + "logps/rejected": -756.0, + "loss": 0.0146, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -10.125, + "rewards/margins": 12.8125, + "rewards/rejected": -22.875, + "step": 4550 + }, + { + "epoch": 2.3861852433281006, + "grad_norm": 0.0988592319744124, + "learning_rate": 6.111246266003859e-06, + "logits/chosen": -12.3125, + "logits/rejected": -12.0, + "logps/chosen": -560.0, + "logps/rejected": -820.0, + "loss": 0.013, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.4375, + "rewards/margins": 13.25, + "rewards/rejected": -23.625, + "step": 4560 + }, + { + "epoch": 2.3914181057038197, + "grad_norm": 2.33950433690266, + "learning_rate": 6.011867092353934e-06, + "logits/chosen": -12.8125, + "logits/rejected": -12.5625, + "logps/chosen": -552.0, + "logps/rejected": -756.0, + "loss": 0.0195, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.5625, + "rewards/margins": 13.375, + "rewards/rejected": -23.0, + "step": 4570 + }, + { + "epoch": 2.3966509680795394, + "grad_norm": 0.0681730768269089, + "learning_rate": 5.913192043454724e-06, + "logits/chosen": -12.3125, + "logits/rejected": -12.125, + "logps/chosen": -520.0, + "logps/rejected": -768.0, + "loss": 0.0249, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -10.125, + "rewards/margins": 12.375, + "rewards/rejected": -22.5, + "step": 4580 + }, + { + "epoch": 2.401883830455259, + "grad_norm": 0.42266101198548967, + "learning_rate": 5.815224778409767e-06, + "logits/chosen": -12.5625, + "logits/rejected": -12.5, + "logps/chosen": -560.0, + "logps/rejected": -772.0, + "loss": 0.0081, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.5, + "rewards/margins": 12.75, + "rewards/rejected": -23.25, + "step": 4590 + }, + { + "epoch": 2.4071166928309786, + "grad_norm": 0.3853290474720451, + "learning_rate": 5.71796893007629e-06, + "logits/chosen": -12.375, + "logits/rejected": -11.875, + "logps/chosen": -496.0, + "logps/rejected": -760.0, + "loss": 0.0072, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -9.625, + "rewards/margins": 13.75, + "rewards/rejected": -23.375, + "step": 4600 + }, + { + "epoch": 2.4123495552066982, + "grad_norm": 0.10250705226256952, + "learning_rate": 5.621428104930529e-06, + "logits/chosen": -12.6875, + "logits/rejected": -12.1875, + "logps/chosen": -494.0, + "logps/rejected": -728.0, + "loss": 0.0237, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -11.25, + "rewards/margins": 12.375, + "rewards/rejected": -23.625, + "step": 4610 + }, + { + "epoch": 2.4175824175824174, + "grad_norm": 0.13828507154871758, + "learning_rate": 5.525605882933965e-06, + "logits/chosen": -12.5625, + "logits/rejected": -12.4375, + "logps/chosen": -540.0, + "logps/rejected": -804.0, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.0625, + "rewards/margins": 12.5625, + "rewards/rejected": -22.75, + "step": 4620 + }, + { + "epoch": 2.422815279958137, + "grad_norm": 5.641455891192051, + "learning_rate": 5.430505817400586e-06, + "logits/chosen": -12.6875, + "logits/rejected": -12.75, + "logps/chosen": -576.0, + "logps/rejected": -816.0, + "loss": 0.0214, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -9.625, + "rewards/margins": 14.125, + "rewards/rejected": -23.75, + "step": 4630 + }, + { + "epoch": 2.4280481423338567, + "grad_norm": 0.42243425995189116, + "learning_rate": 5.33613143486511e-06, + "logits/chosen": -12.5, + "logits/rejected": -12.4375, + "logps/chosen": -548.0, + "logps/rejected": -728.0, + "loss": 0.0123, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.0625, + "rewards/margins": 13.5625, + "rewards/rejected": -22.625, + "step": 4640 + }, + { + "epoch": 2.4332810047095763, + "grad_norm": 1.4777844624098502, + "learning_rate": 5.2424862349522065e-06, + "logits/chosen": -12.5625, + "logits/rejected": -12.25, + "logps/chosen": -548.0, + "logps/rejected": -736.0, + "loss": 0.0204, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -10.3125, + "rewards/margins": 12.625, + "rewards/rejected": -23.0, + "step": 4650 + }, + { + "epoch": 2.4385138670852955, + "grad_norm": 0.05115813621262884, + "learning_rate": 5.149573690246759e-06, + "logits/chosen": -12.5625, + "logits/rejected": -12.0, + "logps/chosen": -580.0, + "logps/rejected": -772.0, + "loss": 0.0106, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -10.75, + "rewards/margins": 12.5625, + "rewards/rejected": -23.375, + "step": 4660 + }, + { + "epoch": 2.443746729461015, + "grad_norm": 0.06549014862611358, + "learning_rate": 5.0573972461650524e-06, + "logits/chosen": -12.25, + "logits/rejected": -12.25, + "logps/chosen": -636.0, + "logps/rejected": -824.0, + "loss": 0.0133, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -11.25, + "rewards/margins": 13.25, + "rewards/rejected": -24.5, + "step": 4670 + }, + { + "epoch": 2.4489795918367347, + "grad_norm": 0.17158367161151364, + "learning_rate": 4.965960320827018e-06, + "logits/chosen": -12.375, + "logits/rejected": -12.1875, + "logps/chosen": -608.0, + "logps/rejected": -784.0, + "loss": 0.0089, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.5, + "rewards/margins": 12.375, + "rewards/rejected": -22.875, + "step": 4680 + }, + { + "epoch": 2.4542124542124544, + "grad_norm": 0.08224298980083439, + "learning_rate": 4.875266304929496e-06, + "logits/chosen": -12.6875, + "logits/rejected": -12.1875, + "logps/chosen": -528.0, + "logps/rejected": -712.0, + "loss": 0.0197, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.5625, + "rewards/margins": 11.75, + "rewards/rejected": -22.25, + "step": 4690 + }, + { + "epoch": 2.4594453165881736, + "grad_norm": 2.1347869965151687, + "learning_rate": 4.7853185616205105e-06, + "logits/chosen": -12.8125, + "logits/rejected": -12.3125, + "logps/chosen": -536.0, + "logps/rejected": -756.0, + "loss": 0.0123, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -10.75, + "rewards/margins": 11.6875, + "rewards/rejected": -22.375, + "step": 4700 + }, + { + "epoch": 2.464678178963893, + "grad_norm": 0.9619277161341164, + "learning_rate": 4.696120426374504e-06, + "logits/chosen": -12.4375, + "logits/rejected": -12.25, + "logps/chosen": -516.0, + "logps/rejected": -780.0, + "loss": 0.011, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -9.8125, + "rewards/margins": 15.25, + "rewards/rejected": -25.0, + "step": 4710 + }, + { + "epoch": 2.469911041339613, + "grad_norm": 1.6743219521030817, + "learning_rate": 4.607675206868706e-06, + "logits/chosen": -12.5, + "logits/rejected": -12.3125, + "logps/chosen": -490.0, + "logps/rejected": -668.0, + "loss": 0.015, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.3125, + "rewards/margins": 11.4375, + "rewards/rejected": -21.75, + "step": 4720 + }, + { + "epoch": 2.4751439037153324, + "grad_norm": 0.1333846845539627, + "learning_rate": 4.5199861828604525e-06, + "logits/chosen": -12.8125, + "logits/rejected": -12.5625, + "logps/chosen": -532.0, + "logps/rejected": -752.0, + "loss": 0.0232, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -9.875, + "rewards/margins": 13.125, + "rewards/rejected": -23.0, + "step": 4730 + }, + { + "epoch": 2.4803767660910516, + "grad_norm": 0.06235123879594944, + "learning_rate": 4.433056606065553e-06, + "logits/chosen": -12.75, + "logits/rejected": -12.375, + "logps/chosen": -512.0, + "logps/rejected": -740.0, + "loss": 0.0108, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.75, + "rewards/margins": 13.4375, + "rewards/rejected": -23.25, + "step": 4740 + }, + { + "epoch": 2.4856096284667712, + "grad_norm": 0.026777284454816707, + "learning_rate": 4.346889700037743e-06, + "logits/chosen": -12.875, + "logits/rejected": -12.625, + "logps/chosen": -478.0, + "logps/rejected": -720.0, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.4375, + "rewards/margins": 13.5, + "rewards/rejected": -23.0, + "step": 4750 + }, + { + "epoch": 2.490842490842491, + "grad_norm": 0.6344174994119484, + "learning_rate": 4.261488660049112e-06, + "logits/chosen": -13.125, + "logits/rejected": -12.4375, + "logps/chosen": -520.0, + "logps/rejected": -816.0, + "loss": 0.0102, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -9.75, + "rewards/margins": 15.375, + "rewards/rejected": -25.125, + "step": 4760 + }, + { + "epoch": 2.4960753532182105, + "grad_norm": 4.869230110057092, + "learning_rate": 4.176856652971642e-06, + "logits/chosen": -12.8125, + "logits/rejected": -12.625, + "logps/chosen": -508.0, + "logps/rejected": -696.0, + "loss": 0.0234, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -9.9375, + "rewards/margins": 12.0625, + "rewards/rejected": -22.0, + "step": 4770 + }, + { + "epoch": 2.50130821559393, + "grad_norm": 0.12396452174190684, + "learning_rate": 4.092996817159752e-06, + "logits/chosen": -12.875, + "logits/rejected": -13.0, + "logps/chosen": -512.0, + "logps/rejected": -700.0, + "loss": 0.0056, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -9.875, + "rewards/margins": 12.5, + "rewards/rejected": -22.375, + "step": 4780 + }, + { + "epoch": 2.5065410779696493, + "grad_norm": 0.706327597363468, + "learning_rate": 4.009912262333942e-06, + "logits/chosen": -12.875, + "logits/rejected": -12.5625, + "logps/chosen": -536.0, + "logps/rejected": -772.0, + "loss": 0.0113, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.375, + "rewards/margins": 13.1875, + "rewards/rejected": -23.5, + "step": 4790 + }, + { + "epoch": 2.511773940345369, + "grad_norm": 0.7436869258928149, + "learning_rate": 3.927606069465442e-06, + "logits/chosen": -12.6875, + "logits/rejected": -12.4375, + "logps/chosen": -564.0, + "logps/rejected": -784.0, + "loss": 0.0114, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.8125, + "rewards/margins": 14.5, + "rewards/rejected": -24.375, + "step": 4800 + }, + { + "epoch": 2.5170068027210886, + "grad_norm": 1.309773829889326, + "learning_rate": 3.8460812906620045e-06, + "logits/chosen": -12.75, + "logits/rejected": -12.625, + "logps/chosen": -588.0, + "logps/rejected": -760.0, + "loss": 0.012, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -10.5, + "rewards/margins": 11.75, + "rewards/rejected": -22.25, + "step": 4810 + }, + { + "epoch": 2.5222396650968077, + "grad_norm": 0.3227734134583067, + "learning_rate": 3.7653409490546963e-06, + "logits/chosen": -12.5, + "logits/rejected": -12.6875, + "logps/chosen": -540.0, + "logps/rejected": -740.0, + "loss": 0.0197, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -9.6875, + "rewards/margins": 13.125, + "rewards/rejected": -22.875, + "step": 4820 + }, + { + "epoch": 2.5274725274725274, + "grad_norm": 0.2844434389328289, + "learning_rate": 3.6853880386858107e-06, + "logits/chosen": -12.75, + "logits/rejected": -12.5, + "logps/chosen": -532.0, + "logps/rejected": -760.0, + "loss": 0.0163, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -9.0, + "rewards/margins": 13.25, + "rewards/rejected": -22.25, + "step": 4830 + }, + { + "epoch": 2.532705389848247, + "grad_norm": 1.4407739685644243, + "learning_rate": 3.60622552439783e-06, + "logits/chosen": -12.4375, + "logits/rejected": -12.125, + "logps/chosen": -492.0, + "logps/rejected": -720.0, + "loss": 0.0212, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -9.0625, + "rewards/margins": 13.625, + "rewards/rejected": -22.75, + "step": 4840 + }, + { + "epoch": 2.5379382522239666, + "grad_norm": 0.23520810629298425, + "learning_rate": 3.527856341723479e-06, + "logits/chosen": -12.8125, + "logits/rejected": -12.6875, + "logps/chosen": -524.0, + "logps/rejected": -756.0, + "loss": 0.012, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -10.5, + "rewards/margins": 12.4375, + "rewards/rejected": -23.0, + "step": 4850 + }, + { + "epoch": 2.5431711145996863, + "grad_norm": 0.49363019070075204, + "learning_rate": 3.4502833967768822e-06, + "logits/chosen": -12.5625, + "logits/rejected": -12.5, + "logps/chosen": -560.0, + "logps/rejected": -756.0, + "loss": 0.0141, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.5625, + "rewards/margins": 13.625, + "rewards/rejected": -23.25, + "step": 4860 + }, + { + "epoch": 2.5484039769754054, + "grad_norm": 0.06056475449691753, + "learning_rate": 3.373509566145794e-06, + "logits/chosen": -12.9375, + "logits/rejected": -12.8125, + "logps/chosen": -556.0, + "logps/rejected": -728.0, + "loss": 0.0294, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -10.0625, + "rewards/margins": 12.25, + "rewards/rejected": -22.375, + "step": 4870 + }, + { + "epoch": 2.553636839351125, + "grad_norm": 4.573310943983801, + "learning_rate": 3.297537696784911e-06, + "logits/chosen": -13.0625, + "logits/rejected": -12.6875, + "logps/chosen": -490.0, + "logps/rejected": -760.0, + "loss": 0.0225, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -10.1875, + "rewards/margins": 12.5, + "rewards/rejected": -22.75, + "step": 4880 + }, + { + "epoch": 2.5588697017268447, + "grad_norm": 0.06128290637528687, + "learning_rate": 3.2223706059103324e-06, + "logits/chosen": -12.5625, + "logits/rejected": -12.375, + "logps/chosen": -564.0, + "logps/rejected": -744.0, + "loss": 0.0115, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -10.1875, + "rewards/margins": 12.3125, + "rewards/rejected": -22.5, + "step": 4890 + }, + { + "epoch": 2.564102564102564, + "grad_norm": 0.12260215298875937, + "learning_rate": 3.1480110808950747e-06, + "logits/chosen": -12.5625, + "logits/rejected": -12.25, + "logps/chosen": -532.0, + "logps/rejected": -780.0, + "loss": 0.0115, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -10.1875, + "rewards/margins": 13.0625, + "rewards/rejected": -23.25, + "step": 4900 + }, + { + "epoch": 2.5693354264782835, + "grad_norm": 0.08368072058556565, + "learning_rate": 3.07446187916568e-06, + "logits/chosen": -12.8125, + "logits/rejected": -12.5625, + "logps/chosen": -552.0, + "logps/rejected": -756.0, + "loss": 0.0115, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.0, + "rewards/margins": 12.0625, + "rewards/rejected": -22.0, + "step": 4910 + }, + { + "epoch": 2.574568288854003, + "grad_norm": 2.918695964115184, + "learning_rate": 3.0017257281000216e-06, + "logits/chosen": -12.75, + "logits/rejected": -12.875, + "logps/chosen": -568.0, + "logps/rejected": -744.0, + "loss": 0.0206, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.5, + "rewards/margins": 12.75, + "rewards/rejected": -22.25, + "step": 4920 + }, + { + "epoch": 2.5798011512297228, + "grad_norm": 0.06527695856423694, + "learning_rate": 2.9298053249261244e-06, + "logits/chosen": -12.8125, + "logits/rejected": -12.625, + "logps/chosen": -468.0, + "logps/rejected": -752.0, + "loss": 0.0091, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -9.0625, + "rewards/margins": 14.0, + "rewards/rejected": -23.0, + "step": 4930 + }, + { + "epoch": 2.5850340136054424, + "grad_norm": 0.46980953722988, + "learning_rate": 2.858703336622154e-06, + "logits/chosen": -13.0625, + "logits/rejected": -12.75, + "logps/chosen": -480.0, + "logps/rejected": -720.0, + "loss": 0.0525, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -9.6875, + "rewards/margins": 12.3125, + "rewards/rejected": -22.0, + "step": 4940 + }, + { + "epoch": 2.5902668759811616, + "grad_norm": 0.28837857550195123, + "learning_rate": 2.788422399817525e-06, + "logits/chosen": -12.6875, + "logits/rejected": -12.375, + "logps/chosen": -512.0, + "logps/rejected": -812.0, + "loss": 0.0101, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.625, + "rewards/margins": 13.5, + "rewards/rejected": -24.125, + "step": 4950 + }, + { + "epoch": 2.595499738356881, + "grad_norm": 0.06281973988927571, + "learning_rate": 2.718965120695141e-06, + "logits/chosen": -12.625, + "logits/rejected": -12.4375, + "logps/chosen": -552.0, + "logps/rejected": -776.0, + "loss": 0.0185, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -10.375, + "rewards/margins": 11.6875, + "rewards/rejected": -22.0, + "step": 4960 + }, + { + "epoch": 2.600732600732601, + "grad_norm": 0.5587610901639003, + "learning_rate": 2.6503340748947086e-06, + "logits/chosen": -12.8125, + "logits/rejected": -12.5, + "logps/chosen": -548.0, + "logps/rejected": -852.0, + "loss": 0.0219, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -9.4375, + "rewards/margins": 14.8125, + "rewards/rejected": -24.25, + "step": 4970 + }, + { + "epoch": 2.60596546310832, + "grad_norm": 0.03319417097882791, + "learning_rate": 2.5825318074172765e-06, + "logits/chosen": -12.25, + "logits/rejected": -12.5625, + "logps/chosen": -536.0, + "logps/rejected": -776.0, + "loss": 0.0131, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -10.125, + "rewards/margins": 13.0625, + "rewards/rejected": -23.25, + "step": 4980 + }, + { + "epoch": 2.6111983254840396, + "grad_norm": 0.11213587678104356, + "learning_rate": 2.515560832530836e-06, + "logits/chosen": -12.875, + "logits/rejected": -12.5625, + "logps/chosen": -490.0, + "logps/rejected": -784.0, + "loss": 0.0144, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -8.75, + "rewards/margins": 16.0, + "rewards/rejected": -24.75, + "step": 4990 + }, + { + "epoch": 2.6164311878597593, + "grad_norm": 0.09685923373345516, + "learning_rate": 2.4494236336770697e-06, + "logits/chosen": -12.5, + "logits/rejected": -12.3125, + "logps/chosen": -510.0, + "logps/rejected": -740.0, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.8125, + "rewards/margins": 12.375, + "rewards/rejected": -22.125, + "step": 5000 + }, + { + "epoch": 2.621664050235479, + "grad_norm": 2.993062977560315, + "learning_rate": 2.3841226633792983e-06, + "logits/chosen": -12.3125, + "logits/rejected": -12.125, + "logps/chosen": -588.0, + "logps/rejected": -756.0, + "loss": 0.0113, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -10.375, + "rewards/margins": 12.9375, + "rewards/rejected": -23.25, + "step": 5010 + }, + { + "epoch": 2.6268969126111985, + "grad_norm": 0.2803893599147877, + "learning_rate": 2.319660343151511e-06, + "logits/chosen": -12.75, + "logits/rejected": -12.75, + "logps/chosen": -536.0, + "logps/rejected": -704.0, + "loss": 0.0262, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.625, + "rewards/margins": 11.0625, + "rewards/rejected": -21.75, + "step": 5020 + }, + { + "epoch": 2.6321297749869177, + "grad_norm": 0.19574905752849175, + "learning_rate": 2.2560390634085714e-06, + "logits/chosen": -12.9375, + "logits/rejected": -12.75, + "logps/chosen": -494.0, + "logps/rejected": -764.0, + "loss": 0.0152, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -9.875, + "rewards/margins": 12.875, + "rewards/rejected": -22.75, + "step": 5030 + }, + { + "epoch": 2.6373626373626373, + "grad_norm": 0.11308648254691579, + "learning_rate": 2.1932611833775846e-06, + "logits/chosen": -12.6875, + "logits/rejected": -12.625, + "logps/chosen": -556.0, + "logps/rejected": -772.0, + "loss": 0.0164, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.1875, + "rewards/margins": 14.125, + "rewards/rejected": -24.25, + "step": 5040 + }, + { + "epoch": 2.642595499738357, + "grad_norm": 0.19180938005745587, + "learning_rate": 2.13132903101039e-06, + "logits/chosen": -13.0, + "logits/rejected": -12.5625, + "logps/chosen": -496.0, + "logps/rejected": -744.0, + "loss": 0.0052, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.9375, + "rewards/margins": 13.0625, + "rewards/rejected": -23.0, + "step": 5050 + }, + { + "epoch": 2.647828362114076, + "grad_norm": 0.0824809238337483, + "learning_rate": 2.0702449028972698e-06, + "logits/chosen": -12.75, + "logits/rejected": -12.5625, + "logps/chosen": -528.0, + "logps/rejected": -776.0, + "loss": 0.0164, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -9.625, + "rewards/margins": 14.3125, + "rewards/rejected": -24.0, + "step": 5060 + }, + { + "epoch": 2.6530612244897958, + "grad_norm": 0.19257105032977728, + "learning_rate": 2.0100110641817548e-06, + "logits/chosen": -12.5625, + "logits/rejected": -12.125, + "logps/chosen": -532.0, + "logps/rejected": -752.0, + "loss": 0.0133, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -11.25, + "rewards/margins": 11.5625, + "rewards/rejected": -22.75, + "step": 5070 + }, + { + "epoch": 2.6582940868655154, + "grad_norm": 0.6376462030458858, + "learning_rate": 1.9506297484766427e-06, + "logits/chosen": -12.6875, + "logits/rejected": -12.9375, + "logps/chosen": -564.0, + "logps/rejected": -744.0, + "loss": 0.01, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.6875, + "rewards/margins": 14.0625, + "rewards/rejected": -23.75, + "step": 5080 + }, + { + "epoch": 2.663526949241235, + "grad_norm": 1.2368487640960626, + "learning_rate": 1.8921031577811693e-06, + "logits/chosen": -12.625, + "logits/rejected": -12.5625, + "logps/chosen": -556.0, + "logps/rejected": -752.0, + "loss": 0.0122, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -10.375, + "rewards/margins": 11.8125, + "rewards/rejected": -22.25, + "step": 5090 + }, + { + "epoch": 2.6687598116169546, + "grad_norm": 0.2769381517332766, + "learning_rate": 1.8344334623993515e-06, + "logits/chosen": -12.625, + "logits/rejected": -12.5, + "logps/chosen": -520.0, + "logps/rejected": -760.0, + "loss": 0.0109, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -10.0625, + "rewards/margins": 13.375, + "rewards/rejected": -23.5, + "step": 5100 + }, + { + "epoch": 2.6739926739926743, + "grad_norm": 0.10428401411066154, + "learning_rate": 1.7776228008594965e-06, + "logits/chosen": -13.0, + "logits/rejected": -12.875, + "logps/chosen": -560.0, + "logps/rejected": -828.0, + "loss": 0.0099, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.9375, + "rewards/margins": 14.6875, + "rewards/rejected": -24.625, + "step": 5110 + }, + { + "epoch": 2.6792255363683934, + "grad_norm": 0.5981436652803144, + "learning_rate": 1.721673279834926e-06, + "logits/chosen": -13.0, + "logits/rejected": -12.1875, + "logps/chosen": -520.0, + "logps/rejected": -720.0, + "loss": 0.011, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -11.25, + "rewards/margins": 11.1875, + "rewards/rejected": -22.375, + "step": 5120 + }, + { + "epoch": 2.684458398744113, + "grad_norm": 0.03180320517493719, + "learning_rate": 1.6665869740658312e-06, + "logits/chosen": -12.875, + "logits/rejected": -12.625, + "logps/chosen": -608.0, + "logps/rejected": -868.0, + "loss": 0.0077, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.0, + "rewards/margins": 15.375, + "rewards/rejected": -25.375, + "step": 5130 + }, + { + "epoch": 2.6896912611198327, + "grad_norm": 0.07580243042957731, + "learning_rate": 1.6123659262823498e-06, + "logits/chosen": -12.625, + "logits/rejected": -12.625, + "logps/chosen": -506.0, + "logps/rejected": -700.0, + "loss": 0.0124, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -9.9375, + "rewards/margins": 12.5625, + "rewards/rejected": -22.5, + "step": 5140 + }, + { + "epoch": 2.694924123495552, + "grad_norm": 1.4111733913030347, + "learning_rate": 1.5590121471288106e-06, + "logits/chosen": -12.6875, + "logits/rejected": -12.5, + "logps/chosen": -520.0, + "logps/rejected": -688.0, + "loss": 0.0092, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -10.375, + "rewards/margins": 11.1875, + "rewards/rejected": -21.5, + "step": 5150 + }, + { + "epoch": 2.7001569858712715, + "grad_norm": 0.08758065150759772, + "learning_rate": 1.5065276150891788e-06, + "logits/chosen": -12.625, + "logits/rejected": -12.125, + "logps/chosen": -524.0, + "logps/rejected": -748.0, + "loss": 0.0115, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -10.5625, + "rewards/margins": 13.3125, + "rewards/rejected": -23.875, + "step": 5160 + }, + { + "epoch": 2.705389848246991, + "grad_norm": 0.05585072015141439, + "learning_rate": 1.4549142764136769e-06, + "logits/chosen": -13.1875, + "logits/rejected": -12.6875, + "logps/chosen": -486.0, + "logps/rejected": -732.0, + "loss": 0.0132, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -10.0, + "rewards/margins": 12.8125, + "rewards/rejected": -22.75, + "step": 5170 + }, + { + "epoch": 2.7106227106227108, + "grad_norm": 2.383493892083904, + "learning_rate": 1.4041740450466385e-06, + "logits/chosen": -13.0625, + "logits/rejected": -13.0625, + "logps/chosen": -544.0, + "logps/rejected": -760.0, + "loss": 0.02, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -10.8125, + "rewards/margins": 13.0625, + "rewards/rejected": -23.875, + "step": 5180 + }, + { + "epoch": 2.7158555729984304, + "grad_norm": 0.4685409939387069, + "learning_rate": 1.3543088025555095e-06, + "logits/chosen": -12.8125, + "logits/rejected": -12.375, + "logps/chosen": -490.0, + "logps/rejected": -684.0, + "loss": 0.0125, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -10.1875, + "rewards/margins": 11.5625, + "rewards/rejected": -21.75, + "step": 5190 + }, + { + "epoch": 2.7210884353741496, + "grad_norm": 0.024282596932399626, + "learning_rate": 1.3053203980610746e-06, + "logits/chosen": -12.6875, + "logits/rejected": -12.1875, + "logps/chosen": -556.0, + "logps/rejected": -824.0, + "loss": 0.0077, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -9.5625, + "rewards/margins": 15.4375, + "rewards/rejected": -25.0, + "step": 5200 + }, + { + "epoch": 2.726321297749869, + "grad_norm": 0.08325518642920167, + "learning_rate": 1.2572106481689245e-06, + "logits/chosen": -12.875, + "logits/rejected": -12.5, + "logps/chosen": -474.0, + "logps/rejected": -728.0, + "loss": 0.0157, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -10.375, + "rewards/margins": 12.4375, + "rewards/rejected": -22.75, + "step": 5210 + }, + { + "epoch": 2.731554160125589, + "grad_norm": 0.48506294422392854, + "learning_rate": 1.2099813369020468e-06, + "logits/chosen": -12.875, + "logits/rejected": -12.6875, + "logps/chosen": -536.0, + "logps/rejected": -780.0, + "loss": 0.0343, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.8125, + "rewards/margins": 13.875, + "rewards/rejected": -23.75, + "step": 5220 + }, + { + "epoch": 2.736787022501308, + "grad_norm": 2.6864742223740325, + "learning_rate": 1.1636342156346846e-06, + "logits/chosen": -12.75, + "logits/rejected": -12.3125, + "logps/chosen": -572.0, + "logps/rejected": -776.0, + "loss": 0.009, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.375, + "rewards/margins": 12.0625, + "rewards/rejected": -23.375, + "step": 5230 + }, + { + "epoch": 2.7420198848770276, + "grad_norm": 0.23707346252631317, + "learning_rate": 1.1181710030274046e-06, + "logits/chosen": -13.125, + "logits/rejected": -12.5625, + "logps/chosen": -474.0, + "logps/rejected": -696.0, + "loss": 0.0136, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.625, + "rewards/margins": 11.5625, + "rewards/rejected": -22.25, + "step": 5240 + }, + { + "epoch": 2.7472527472527473, + "grad_norm": 2.157658970277411, + "learning_rate": 1.073593384963356e-06, + "logits/chosen": -12.6875, + "logits/rejected": -12.375, + "logps/chosen": -544.0, + "logps/rejected": -756.0, + "loss": 0.0191, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.25, + "rewards/margins": 14.125, + "rewards/rejected": -24.375, + "step": 5250 + }, + { + "epoch": 2.752485609628467, + "grad_norm": 0.3102770843154215, + "learning_rate": 1.0299030144857446e-06, + "logits/chosen": -13.3125, + "logits/rejected": -12.75, + "logps/chosen": -490.0, + "logps/rejected": -792.0, + "loss": 0.0104, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -10.875, + "rewards/margins": 13.125, + "rewards/rejected": -24.0, + "step": 5260 + }, + { + "epoch": 2.7577184720041865, + "grad_norm": 0.07324163406898264, + "learning_rate": 9.871015117365518e-07, + "logits/chosen": -12.6875, + "logits/rejected": -12.5625, + "logps/chosen": -528.0, + "logps/rejected": -728.0, + "loss": 0.0106, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -10.8125, + "rewards/margins": 12.3125, + "rewards/rejected": -23.125, + "step": 5270 + }, + { + "epoch": 2.7629513343799057, + "grad_norm": 0.06042974721060957, + "learning_rate": 9.451904638964448e-07, + "logits/chosen": -12.8125, + "logits/rejected": -12.4375, + "logps/chosen": -572.0, + "logps/rejected": -764.0, + "loss": 0.0135, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -10.375, + "rewards/margins": 12.5, + "rewards/rejected": -22.875, + "step": 5280 + }, + { + "epoch": 2.7681841967556253, + "grad_norm": 0.3758142300940554, + "learning_rate": 9.041714251259215e-07, + "logits/chosen": -12.8125, + "logits/rejected": -12.8125, + "logps/chosen": -560.0, + "logps/rejected": -728.0, + "loss": 0.0081, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.8125, + "rewards/margins": 12.25, + "rewards/rejected": -22.0, + "step": 5290 + }, + { + "epoch": 2.773417059131345, + "grad_norm": 0.0433372475756288, + "learning_rate": 8.640459165076858e-07, + "logits/chosen": -12.625, + "logits/rejected": -12.4375, + "logps/chosen": -500.0, + "logps/rejected": -732.0, + "loss": 0.0135, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.5625, + "rewards/margins": 11.8125, + "rewards/rejected": -22.375, + "step": 5300 + }, + { + "epoch": 2.778649921507064, + "grad_norm": 0.022121147398926958, + "learning_rate": 8.248154259902247e-07, + "logits/chosen": -12.875, + "logits/rejected": -12.6875, + "logps/chosen": -516.0, + "logps/rejected": -684.0, + "loss": 0.0119, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -10.0625, + "rewards/margins": 11.25, + "rewards/rejected": -21.375, + "step": 5310 + }, + { + "epoch": 2.7838827838827838, + "grad_norm": 0.36018606872198927, + "learning_rate": 7.86481408332651e-07, + "logits/chosen": -12.6875, + "logits/rejected": -12.1875, + "logps/chosen": -496.0, + "logps/rejected": -724.0, + "loss": 0.0143, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -9.875, + "rewards/margins": 12.5, + "rewards/rejected": -22.375, + "step": 5320 + }, + { + "epoch": 2.7891156462585034, + "grad_norm": 0.2828337088179076, + "learning_rate": 7.490452850507507e-07, + "logits/chosen": -13.1875, + "logits/rejected": -12.8125, + "logps/chosen": -470.0, + "logps/rejected": -648.0, + "loss": 0.0509, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -9.875, + "rewards/margins": 11.375, + "rewards/rejected": -21.25, + "step": 5330 + }, + { + "epoch": 2.794348508634223, + "grad_norm": 0.1829888250339699, + "learning_rate": 7.125084443642654e-07, + "logits/chosen": -12.75, + "logits/rejected": -12.375, + "logps/chosen": -548.0, + "logps/rejected": -752.0, + "loss": 0.01, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -9.9375, + "rewards/margins": 13.625, + "rewards/rejected": -23.5, + "step": 5340 + }, + { + "epoch": 2.7995813710099426, + "grad_norm": 0.3036718148331244, + "learning_rate": 6.768722411454154e-07, + "logits/chosen": -13.0625, + "logits/rejected": -12.8125, + "logps/chosen": -540.0, + "logps/rejected": -760.0, + "loss": 0.0206, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.25, + "rewards/margins": 12.875, + "rewards/rejected": -23.25, + "step": 5350 + }, + { + "epoch": 2.804814233385662, + "grad_norm": 1.8143984030436722, + "learning_rate": 6.421379968686664e-07, + "logits/chosen": -12.8125, + "logits/rejected": -12.5625, + "logps/chosen": -588.0, + "logps/rejected": -792.0, + "loss": 0.0124, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.875, + "rewards/margins": 13.875, + "rewards/rejected": -23.75, + "step": 5360 + }, + { + "epoch": 2.8100470957613815, + "grad_norm": 0.9839252753214983, + "learning_rate": 6.083069995617113e-07, + "logits/chosen": -12.875, + "logits/rejected": -12.6875, + "logps/chosen": -520.0, + "logps/rejected": -712.0, + "loss": 0.0175, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -10.5, + "rewards/margins": 11.8125, + "rewards/rejected": -22.25, + "step": 5370 + }, + { + "epoch": 2.815279958137101, + "grad_norm": 0.03769369288362027, + "learning_rate": 5.753805037577193e-07, + "logits/chosen": -12.875, + "logits/rejected": -12.5625, + "logps/chosen": -502.0, + "logps/rejected": -732.0, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.125, + "rewards/margins": 12.25, + "rewards/rejected": -22.375, + "step": 5380 + }, + { + "epoch": 2.8205128205128203, + "grad_norm": 1.3710061295371272, + "learning_rate": 5.433597304488114e-07, + "logits/chosen": -12.6875, + "logits/rejected": -12.625, + "logps/chosen": -568.0, + "logps/rejected": -800.0, + "loss": 0.0189, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -9.5, + "rewards/margins": 14.0, + "rewards/rejected": -23.5, + "step": 5390 + }, + { + "epoch": 2.82574568288854, + "grad_norm": 0.04654503797633383, + "learning_rate": 5.122458670407837e-07, + "logits/chosen": -12.875, + "logits/rejected": -12.5625, + "logps/chosen": -510.0, + "logps/rejected": -716.0, + "loss": 0.0097, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.9375, + "rewards/margins": 12.3125, + "rewards/rejected": -22.25, + "step": 5400 + }, + { + "epoch": 2.8309785452642595, + "grad_norm": 0.07293005563672031, + "learning_rate": 4.820400673090669e-07, + "logits/chosen": -12.625, + "logits/rejected": -12.375, + "logps/chosen": -592.0, + "logps/rejected": -764.0, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": -11.1875, + "rewards/margins": 11.25, + "rewards/rejected": -22.5, + "step": 5410 + }, + { + "epoch": 2.836211407639979, + "grad_norm": 0.10617523379533803, + "learning_rate": 4.527434513559553e-07, + "logits/chosen": -12.9375, + "logits/rejected": -12.5625, + "logps/chosen": -560.0, + "logps/rejected": -740.0, + "loss": 0.0195, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.0, + "rewards/margins": 12.1875, + "rewards/rejected": -22.125, + "step": 5420 + }, + { + "epoch": 2.8414442700156988, + "grad_norm": 0.9514774703351614, + "learning_rate": 4.2435710556906485e-07, + "logits/chosen": -12.6875, + "logits/rejected": -12.6875, + "logps/chosen": -564.0, + "logps/rejected": -768.0, + "loss": 0.0161, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -10.9375, + "rewards/margins": 12.5625, + "rewards/rejected": -23.5, + "step": 5430 + }, + { + "epoch": 2.846677132391418, + "grad_norm": 0.10096456261880521, + "learning_rate": 3.968820825810432e-07, + "logits/chosen": -12.875, + "logits/rejected": -12.625, + "logps/chosen": -478.0, + "logps/rejected": -736.0, + "loss": 0.01, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -8.8125, + "rewards/margins": 14.5625, + "rewards/rejected": -23.375, + "step": 5440 + }, + { + "epoch": 2.8519099947671376, + "grad_norm": 0.5066792514792358, + "learning_rate": 3.7031940123053997e-07, + "logits/chosen": -12.875, + "logits/rejected": -12.5, + "logps/chosen": -498.0, + "logps/rejected": -768.0, + "loss": 0.016, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.5625, + "rewards/margins": 12.9375, + "rewards/rejected": -23.5, + "step": 5450 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.9027735586319693, + "learning_rate": 3.4467004652442847e-07, + "logits/chosen": -12.75, + "logits/rejected": -12.625, + "logps/chosen": -532.0, + "logps/rejected": -772.0, + "loss": 0.0198, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.8125, + "rewards/margins": 12.75, + "rewards/rejected": -22.625, + "step": 5460 + }, + { + "epoch": 2.8623757195185764, + "grad_norm": 2.98311443868495, + "learning_rate": 3.1993496960127656e-07, + "logits/chosen": -13.0625, + "logits/rejected": -12.8125, + "logps/chosen": -512.0, + "logps/rejected": -752.0, + "loss": 0.0083, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.4375, + "rewards/margins": 14.3125, + "rewards/rejected": -23.75, + "step": 5470 + }, + { + "epoch": 2.867608581894296, + "grad_norm": 0.10108300143799398, + "learning_rate": 2.961150876960667e-07, + "logits/chosen": -12.8125, + "logits/rejected": -12.4375, + "logps/chosen": -540.0, + "logps/rejected": -820.0, + "loss": 0.01, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -11.0, + "rewards/margins": 13.5, + "rewards/rejected": -24.5, + "step": 5480 + }, + { + "epoch": 2.8728414442700156, + "grad_norm": 0.07157164222596546, + "learning_rate": 2.732112841062034e-07, + "logits/chosen": -12.75, + "logits/rejected": -12.5625, + "logps/chosen": -528.0, + "logps/rejected": -732.0, + "loss": 0.0176, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -10.5625, + "rewards/margins": 11.9375, + "rewards/rejected": -22.5, + "step": 5490 + }, + { + "epoch": 2.8780743066457353, + "grad_norm": 0.29644122911545273, + "learning_rate": 2.5122440815873725e-07, + "logits/chosen": -12.625, + "logits/rejected": -12.75, + "logps/chosen": -528.0, + "logps/rejected": -728.0, + "loss": 0.0221, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -10.0625, + "rewards/margins": 13.4375, + "rewards/rejected": -23.5, + "step": 5500 + }, + { + "epoch": 2.883307169021455, + "grad_norm": 0.2008809792059982, + "learning_rate": 2.301552751788838e-07, + "logits/chosen": -12.875, + "logits/rejected": -12.3125, + "logps/chosen": -540.0, + "logps/rejected": -808.0, + "loss": 0.017, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.875, + "rewards/margins": 14.3125, + "rewards/rejected": -24.125, + "step": 5510 + }, + { + "epoch": 2.8885400313971745, + "grad_norm": 0.7313163229009243, + "learning_rate": 2.1000466645978435e-07, + "logits/chosen": -13.1875, + "logits/rejected": -12.625, + "logps/chosen": -520.0, + "logps/rejected": -712.0, + "loss": 0.0107, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -10.875, + "rewards/margins": 11.9375, + "rewards/rejected": -22.75, + "step": 5520 + }, + { + "epoch": 2.8937728937728937, + "grad_norm": 1.6364561725555695, + "learning_rate": 1.907733292335373e-07, + "logits/chosen": -12.125, + "logits/rejected": -12.25, + "logps/chosen": -572.0, + "logps/rejected": -772.0, + "loss": 0.021, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.4375, + "rewards/margins": 13.9375, + "rewards/rejected": -23.375, + "step": 5530 + }, + { + "epoch": 2.8990057561486133, + "grad_norm": 0.035311367474680014, + "learning_rate": 1.7246197664347875e-07, + "logits/chosen": -12.75, + "logits/rejected": -12.25, + "logps/chosen": -536.0, + "logps/rejected": -772.0, + "loss": 0.0278, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -9.375, + "rewards/margins": 13.125, + "rewards/rejected": -22.5, + "step": 5540 + }, + { + "epoch": 2.904238618524333, + "grad_norm": 4.683425846518621, + "learning_rate": 1.5507128771775347e-07, + "logits/chosen": -12.3125, + "logits/rejected": -12.375, + "logps/chosen": -564.0, + "logps/rejected": -768.0, + "loss": 0.0213, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -10.375, + "rewards/margins": 12.875, + "rewards/rejected": -23.25, + "step": 5550 + }, + { + "epoch": 2.909471480900052, + "grad_norm": 0.9311464987583402, + "learning_rate": 1.386019073441186e-07, + "logits/chosen": -12.875, + "logits/rejected": -12.5, + "logps/chosen": -564.0, + "logps/rejected": -784.0, + "loss": 0.0266, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -10.3125, + "rewards/margins": 12.3125, + "rewards/rejected": -22.625, + "step": 5560 + }, + { + "epoch": 2.9147043432757718, + "grad_norm": 1.8171086732245252, + "learning_rate": 1.2305444624604035e-07, + "logits/chosen": -12.8125, + "logits/rejected": -12.875, + "logps/chosen": -560.0, + "logps/rejected": -772.0, + "loss": 0.0123, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.75, + "rewards/margins": 13.1875, + "rewards/rejected": -23.0, + "step": 5570 + }, + { + "epoch": 2.9199372056514914, + "grad_norm": 0.3308245242542842, + "learning_rate": 1.0842948096004835e-07, + "logits/chosen": -13.25, + "logits/rejected": -12.625, + "logps/chosen": -504.0, + "logps/rejected": -764.0, + "loss": 0.0353, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -9.6875, + "rewards/margins": 13.3125, + "rewards/rejected": -23.0, + "step": 5580 + }, + { + "epoch": 2.925170068027211, + "grad_norm": 7.34462217102031, + "learning_rate": 9.472755381434162e-08, + "logits/chosen": -12.75, + "logits/rejected": -12.6875, + "logps/chosen": -528.0, + "logps/rejected": -684.0, + "loss": 0.0343, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -11.0625, + "rewards/margins": 10.5625, + "rewards/rejected": -21.625, + "step": 5590 + }, + { + "epoch": 2.9304029304029307, + "grad_norm": 0.08522077716129985, + "learning_rate": 8.194917290869908e-08, + "logits/chosen": -13.0, + "logits/rejected": -12.8125, + "logps/chosen": -528.0, + "logps/rejected": -712.0, + "loss": 0.0182, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -10.4375, + "rewards/margins": 12.125, + "rewards/rejected": -22.625, + "step": 5600 + }, + { + "epoch": 2.93563579277865, + "grad_norm": 0.49829930070504364, + "learning_rate": 7.009481209561686e-08, + "logits/chosen": -13.0, + "logits/rejected": -12.4375, + "logps/chosen": -528.0, + "logps/rejected": -780.0, + "loss": 0.0211, + "rewards/accuracies": 1.0, + "rewards/chosen": -9.625, + "rewards/margins": 15.125, + "rewards/rejected": -24.75, + "step": 5610 + }, + { + "epoch": 2.9408686551543695, + "grad_norm": 0.0896042557685845, + "learning_rate": 5.9164910962758445e-08, + "logits/chosen": -12.8125, + "logits/rejected": -12.5, + "logps/chosen": -544.0, + "logps/rejected": -816.0, + "loss": 0.0149, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -10.5, + "rewards/margins": 14.125, + "rewards/rejected": -24.625, + "step": 5620 + }, + { + "epoch": 2.946101517530089, + "grad_norm": 0.07557114485550194, + "learning_rate": 4.915987481662887e-08, + "logits/chosen": -13.0, + "logits/rejected": -12.4375, + "logps/chosen": -466.0, + "logps/rejected": -680.0, + "loss": 0.0259, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -9.5625, + "rewards/margins": 11.6875, + "rewards/rejected": -21.25, + "step": 5630 + }, + { + "epoch": 2.9513343799058083, + "grad_norm": 0.1172664083262538, + "learning_rate": 4.008007466757002e-08, + "logits/chosen": -12.75, + "logits/rejected": -12.625, + "logps/chosen": -572.0, + "logps/rejected": -792.0, + "loss": 0.0151, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -11.1875, + "rewards/margins": 12.75, + "rewards/rejected": -24.0, + "step": 5640 + }, + { + "epoch": 2.956567242281528, + "grad_norm": 0.1298391903154204, + "learning_rate": 3.192584721598002e-08, + "logits/chosen": -12.5625, + "logits/rejected": -12.3125, + "logps/chosen": -506.0, + "logps/rejected": -756.0, + "loss": 0.015, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -9.3125, + "rewards/margins": 14.0, + "rewards/rejected": -23.25, + "step": 5650 + }, + { + "epoch": 2.9618001046572475, + "grad_norm": 0.7035633454767951, + "learning_rate": 2.4697494839850953e-08, + "logits/chosen": -12.625, + "logits/rejected": -12.5625, + "logps/chosen": -544.0, + "logps/rejected": -756.0, + "loss": 0.0165, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -9.375, + "rewards/margins": 13.625, + "rewards/rejected": -23.0, + "step": 5660 + }, + { + "epoch": 2.967032967032967, + "grad_norm": 0.07862395199572014, + "learning_rate": 1.8395285583530654e-08, + "logits/chosen": -12.625, + "logits/rejected": -12.4375, + "logps/chosen": -572.0, + "logps/rejected": -748.0, + "loss": 0.0183, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -10.4375, + "rewards/margins": 12.0, + "rewards/rejected": -22.375, + "step": 5670 + }, + { + "epoch": 2.9722658294086868, + "grad_norm": 0.05368864657027849, + "learning_rate": 1.3019453147805616e-08, + "logits/chosen": -13.25, + "logits/rejected": -12.5625, + "logps/chosen": -520.0, + "logps/rejected": -736.0, + "loss": 0.0081, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -10.25, + "rewards/margins": 13.0, + "rewards/rejected": -23.25, + "step": 5680 + }, + { + "epoch": 2.977498691784406, + "grad_norm": 0.11527478161506108, + "learning_rate": 8.570196881216297e-09, + "logits/chosen": -12.8125, + "logits/rejected": -12.5625, + "logps/chosen": -496.0, + "logps/rejected": -772.0, + "loss": 0.0106, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.0, + "rewards/margins": 13.4375, + "rewards/rejected": -23.5, + "step": 5690 + }, + { + "epoch": 2.9827315541601256, + "grad_norm": 0.0555134584567345, + "learning_rate": 5.04768177268522e-09, + "logits/chosen": -12.875, + "logits/rejected": -12.75, + "logps/chosen": -556.0, + "logps/rejected": -796.0, + "loss": 0.0256, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -10.375, + "rewards/margins": 14.125, + "rewards/rejected": -24.375, + "step": 5700 + }, + { + "epoch": 2.987964416535845, + "grad_norm": 0.9590070181791803, + "learning_rate": 2.4520384453746716e-09, + "logits/chosen": -12.875, + "logits/rejected": -12.625, + "logps/chosen": -560.0, + "logps/rejected": -792.0, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": -10.25, + "rewards/margins": 13.3125, + "rewards/rejected": -23.5, + "step": 5710 + }, + { + "epoch": 2.9931972789115644, + "grad_norm": 0.05085650982391767, + "learning_rate": 7.833631518627815e-10, + "logits/chosen": -13.0, + "logits/rejected": -12.4375, + "logps/chosen": -512.0, + "logps/rejected": -720.0, + "loss": 0.02, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -10.5625, + "rewards/margins": 11.5625, + "rewards/rejected": -22.125, + "step": 5720 + }, + { + "epoch": 2.998430141287284, + "grad_norm": 0.6365850172475589, + "learning_rate": 4.171777056583004e-11, + "logits/chosen": -12.9375, + "logits/rejected": -12.875, + "logps/chosen": -596.0, + "logps/rejected": -744.0, + "loss": 0.011, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -9.875, + "rewards/margins": 13.4375, + "rewards/rejected": -23.25, + "step": 5730 + }, + { + "epoch": 3.0, + "eval_logits/chosen": -13.0, + "eval_logits/rejected": -13.0, + "eval_logps/chosen": -612.0, + "eval_logps/rejected": -632.0, + "eval_loss": 1.0008906126022339, + "eval_rewards/accuracies": 0.71875, + "eval_rewards/chosen": -14.5625, + "eval_rewards/margins": 2.609375, + "eval_rewards/rejected": -17.125, + "eval_runtime": 46.7518, + "eval_samples_per_second": 42.779, + "eval_steps_per_second": 0.684, + "step": 5733 + }, + { + "epoch": 3.0, + "step": 5733, + "total_flos": 0.0, + "train_loss": 0.2962565040964742, + "train_runtime": 10895.6468, + "train_samples_per_second": 16.833, + "train_steps_per_second": 0.526 + } + ], + "logging_steps": 10, + "max_steps": 5733, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}