diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json" --- "a/last-checkpoint/trainer_state.json" +++ "b/last-checkpoint/trainer_state.json" @@ -2,21 +2,21 @@ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.004311993601557882, + "epoch": 0.023994157944152727, "eval_steps": 14379, - "global_step": 62, + "global_step": 345, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 6.954828389609486e-05, - "grad_norm": 61.5, + "grad_norm": 60.0, "learning_rate": 0.0, - "logits/chosen": -1.8334999084472656, - "logits/rejected": -1.7422282695770264, - "logps/chosen": -453.4984436035156, - "logps/rejected": -261.0067138671875, + "logits/chosen": -2.290891647338867, + "logits/rejected": -2.3694534301757812, + "logps/chosen": -397.068115234375, + "logps/rejected": -309.59063720703125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, @@ -26,12 +26,12 @@ }, { "epoch": 0.00013909656779218971, - "grad_norm": 67.5, + "grad_norm": 50.5, "learning_rate": 1.0000000000000002e-06, - "logits/chosen": -2.0693089962005615, - "logits/rejected": -2.312026262283325, - "logps/chosen": -421.627197265625, - "logps/rejected": -344.1189880371094, + "logits/chosen": -2.0520503520965576, + "logits/rejected": -2.226337432861328, + "logps/chosen": -257.00714111328125, + "logps/rejected": -317.8482360839844, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, @@ -41,919 +41,5164 @@ }, { "epoch": 0.00020864485168828458, - "grad_norm": 61.5, + "grad_norm": 69.5, "learning_rate": 2.0000000000000003e-06, - "logits/chosen": -1.8676624298095703, - "logits/rejected": -1.7069666385650635, - "logps/chosen": -422.336669921875, - "logps/rejected": -262.6879577636719, - "loss": 0.6793, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.016884081065654755, - "rewards/margins": 0.030364228412508965, - "rewards/rejected": -0.013480148278176785, + "logits/chosen": -2.186892032623291, + "logits/rejected": -2.286593198776245, + "logps/chosen": -600.5238037109375, + "logps/rejected": -332.48199462890625, + "loss": 0.6824, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.014262468554079533, + "rewards/margins": 0.023449212312698364, + "rewards/rejected": -0.009186744689941406, "step": 3 }, { "epoch": 0.00027819313558437943, - "grad_norm": 52.75, + "grad_norm": 59.0, "learning_rate": 3e-06, - "logits/chosen": -1.6365675926208496, - "logits/rejected": -1.5634245872497559, - "logps/chosen": -283.46124267578125, - "logps/rejected": -302.1606750488281, - "loss": 0.69, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": -0.001719474559649825, - "rewards/margins": 0.007113742642104626, - "rewards/rejected": -0.008833218365907669, + "logits/chosen": -2.033118724822998, + "logits/rejected": -2.279475212097168, + "logps/chosen": -352.17205810546875, + "logps/rejected": -338.9101867675781, + "loss": 0.7004, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.004218320827931166, + "rewards/margins": -0.013098020106554031, + "rewards/rejected": 0.008879699744284153, "step": 4 }, { "epoch": 0.0003477414194804743, - "grad_norm": 68.5, + "grad_norm": 52.5, "learning_rate": 4.000000000000001e-06, - "logits/chosen": -1.9600415229797363, - "logits/rejected": -2.058027505874634, - "logps/chosen": -417.2373046875, - "logps/rejected": -372.2300109863281, - "loss": 0.67, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.02957988902926445, - "rewards/margins": 0.04743534326553345, - "rewards/rejected": -0.017855454236268997, + "logits/chosen": -2.0575103759765625, + "logits/rejected": -1.990567684173584, + "logps/chosen": -332.1406555175781, + "logps/rejected": -235.01812744140625, + "loss": 0.6965, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.011217957362532616, + "rewards/margins": -0.005735684186220169, + "rewards/rejected": -0.005482272710651159, "step": 5 }, { "epoch": 0.00041728970337656917, - "grad_norm": 63.75, + "grad_norm": 58.25, "learning_rate": 5e-06, - "logits/chosen": -2.115816593170166, - "logits/rejected": -2.0286741256713867, - "logps/chosen": -491.65545654296875, - "logps/rejected": -379.29766845703125, - "loss": 0.6673, + "logits/chosen": -2.2586324214935303, + "logits/rejected": -2.2471041679382324, + "logps/chosen": -440.27923583984375, + "logps/rejected": -347.769287109375, + "loss": 0.6825, "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.04700763523578644, - "rewards/margins": 0.055514104664325714, - "rewards/rejected": -0.008506469428539276, + "rewards/chosen": 0.015004740096628666, + "rewards/margins": 0.023995807394385338, + "rewards/rejected": -0.00899107102304697, "step": 6 }, { "epoch": 0.00048683798727266407, - "grad_norm": 51.75, + "grad_norm": 65.5, "learning_rate": 6e-06, - "logits/chosen": -2.2049877643585205, - "logits/rejected": -2.0260984897613525, - "logps/chosen": -260.26806640625, - "logps/rejected": -223.4867706298828, - "loss": 0.6837, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": 0.031063389033079147, - "rewards/margins": 0.02014824002981186, - "rewards/rejected": 0.010915146209299564, + "logits/chosen": -1.7615412473678589, + "logits/rejected": -2.141603946685791, + "logps/chosen": -409.7236328125, + "logps/rejected": -400.96099853515625, + "loss": 0.6797, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.02893630973994732, + "rewards/margins": 0.03066907823085785, + "rewards/rejected": -0.0017327703535556793, "step": 7 }, { "epoch": 0.0005563862711687589, - "grad_norm": 57.75, + "grad_norm": 47.75, "learning_rate": 7e-06, - "logits/chosen": -2.1135902404785156, - "logits/rejected": -2.0760629177093506, - "logps/chosen": -459.1631164550781, - "logps/rejected": -286.584716796875, - "loss": 0.6386, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.14775776863098145, - "rewards/margins": 0.12054676562547684, - "rewards/rejected": 0.02721099928021431, + "logits/chosen": -1.8171985149383545, + "logits/rejected": -2.013758659362793, + "logps/chosen": -301.14990234375, + "logps/rejected": -295.22735595703125, + "loss": 0.6702, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.03964056074619293, + "rewards/margins": 0.049403116106987, + "rewards/rejected": -0.009762555360794067, "step": 8 }, { "epoch": 0.0006259345550648538, - "grad_norm": 60.5, + "grad_norm": 51.25, "learning_rate": 8.000000000000001e-06, - "logits/chosen": -1.9743354320526123, - "logits/rejected": -2.1739554405212402, - "logps/chosen": -476.1739501953125, - "logps/rejected": -455.08880615234375, - "loss": 0.6446, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": 0.08817607909440994, - "rewards/margins": 0.11239796876907349, - "rewards/rejected": -0.024221880361437798, + "logits/chosen": -1.8066446781158447, + "logits/rejected": -1.6885275840759277, + "logps/chosen": -326.4176940917969, + "logps/rejected": -231.61349487304688, + "loss": 0.6684, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.020799310877919197, + "rewards/margins": 0.0585421547293663, + "rewards/rejected": -0.03774283453822136, "step": 9 }, { "epoch": 0.0006954828389609487, - "grad_norm": 43.5, + "grad_norm": 48.5, "learning_rate": 9e-06, - "logits/chosen": -1.7726998329162598, - "logits/rejected": -1.722292184829712, - "logps/chosen": -254.57232666015625, - "logps/rejected": -227.73849487304688, - "loss": 0.688, + "logits/chosen": -1.7484585046768188, + "logits/rejected": -2.037735939025879, + "logps/chosen": -409.28668212890625, + "logps/rejected": -277.86724853515625, + "loss": 0.6145, "rewards/accuracies": 0.5, - "rewards/chosen": 0.005527876317501068, - "rewards/margins": 0.020915700122714043, - "rewards/rejected": -0.015387821942567825, + "rewards/chosen": 0.13092058897018433, + "rewards/margins": 0.2132578194141388, + "rewards/rejected": -0.08233723044395447, "step": 10 }, { "epoch": 0.0007650311228570435, - "grad_norm": 44.25, + "grad_norm": 45.75, "learning_rate": 1e-05, - "logits/chosen": -1.874917984008789, - "logits/rejected": -1.6966252326965332, - "logps/chosen": -398.72125244140625, - "logps/rejected": -279.9407958984375, - "loss": 0.5934, + "logits/chosen": -1.969573974609375, + "logits/rejected": -2.109876871109009, + "logps/chosen": -363.9532165527344, + "logps/rejected": -300.8861083984375, + "loss": 0.6662, "rewards/accuracies": 0.550000011920929, - "rewards/chosen": 0.25512638688087463, - "rewards/margins": 0.2751691937446594, - "rewards/rejected": -0.020042819902300835, + "rewards/chosen": 0.025846920907497406, + "rewards/margins": 0.06887838244438171, + "rewards/rejected": -0.043031465262174606, "step": 11 }, { "epoch": 0.0008345794067531383, - "grad_norm": 64.5, + "grad_norm": 40.5, "learning_rate": 9.999999986733346e-06, - "logits/chosen": -1.799363374710083, - "logits/rejected": -2.208996295928955, - "logps/chosen": -454.6282958984375, - "logps/rejected": -420.08782958984375, - "loss": 0.6099, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.2652060091495514, - "rewards/margins": 0.22198839485645294, - "rewards/rejected": 0.04321761801838875, + "logits/chosen": -2.1857051849365234, + "logits/rejected": -2.1949193477630615, + "logps/chosen": -218.37246704101562, + "logps/rejected": -214.57034301757812, + "loss": 0.6653, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.02295926958322525, + "rewards/margins": 0.06356047838926315, + "rewards/rejected": -0.0865197405219078, "step": 12 }, { "epoch": 0.0009041276906492332, - "grad_norm": 49.5, + "grad_norm": 45.5, "learning_rate": 9.999999946933385e-06, - "logits/chosen": -2.147571086883545, - "logits/rejected": -2.1741299629211426, - "logps/chosen": -422.0687255859375, - "logps/rejected": -300.4375, - "loss": 0.5828, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": 0.39246436953544617, - "rewards/margins": 0.3933897912502289, - "rewards/rejected": -0.0009253881871700287, + "logits/chosen": -2.063005208969116, + "logits/rejected": -2.17295241355896, + "logps/chosen": -472.1541748046875, + "logps/rejected": -244.9537811279297, + "loss": 0.5797, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.24895532429218292, + "rewards/margins": 0.35012492537498474, + "rewards/rejected": -0.10116957128047943, "step": 13 }, { "epoch": 0.0009736759745453281, - "grad_norm": 68.5, + "grad_norm": 48.25, "learning_rate": 9.999999880600117e-06, - "logits/chosen": -2.423578977584839, - "logits/rejected": -2.1716604232788086, - "logps/chosen": -444.1280822753906, - "logps/rejected": -315.6869201660156, - "loss": 0.6092, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.22460997104644775, - "rewards/margins": 0.2520143687725067, - "rewards/rejected": -0.02740439586341381, + "logits/chosen": -2.0650248527526855, + "logits/rejected": -2.0925192832946777, + "logps/chosen": -400.0792236328125, + "logps/rejected": -254.74302673339844, + "loss": 0.6093, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.11245658248662949, + "rewards/margins": 0.25989192724227905, + "rewards/rejected": -0.14743533730506897, "step": 14 }, { "epoch": 0.001043224258441423, - "grad_norm": 47.75, + "grad_norm": 46.75, "learning_rate": 9.999999787733541e-06, - "logits/chosen": -2.1961770057678223, - "logits/rejected": -1.9373822212219238, - "logps/chosen": -340.7001953125, - "logps/rejected": -267.83990478515625, - "loss": 0.6142, - "rewards/accuracies": 0.5, - "rewards/chosen": 0.3579849898815155, - "rewards/margins": 0.3051595091819763, - "rewards/rejected": 0.05282551422715187, + "logits/chosen": -2.0637147426605225, + "logits/rejected": -2.276603937149048, + "logps/chosen": -288.91583251953125, + "logps/rejected": -288.9216003417969, + "loss": 0.594, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.26771795749664307, + "rewards/margins": 0.38354071974754333, + "rewards/rejected": -0.11582276225090027, "step": 15 }, { "epoch": 0.0011127725423375177, - "grad_norm": 53.75, + "grad_norm": 58.75, "learning_rate": 9.999999668333659e-06, - "logits/chosen": -2.373560667037964, - "logits/rejected": -2.6588644981384277, - "logps/chosen": -359.1456604003906, - "logps/rejected": -380.36737060546875, - "loss": 0.6152, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.2963922321796417, - "rewards/margins": 0.2529491186141968, - "rewards/rejected": 0.04344310984015465, + "logits/chosen": -2.052729368209839, + "logits/rejected": -2.3459558486938477, + "logps/chosen": -455.4678649902344, + "logps/rejected": -414.88397216796875, + "loss": 0.5196, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.49366331100463867, + "rewards/margins": 0.6734903454780579, + "rewards/rejected": -0.17982706427574158, "step": 16 }, { "epoch": 0.0011823208262336127, - "grad_norm": 42.25, + "grad_norm": 51.5, "learning_rate": 9.99999952240047e-06, - "logits/chosen": -1.798056960105896, - "logits/rejected": -1.9035239219665527, - "logps/chosen": -312.77783203125, - "logps/rejected": -276.3880310058594, - "loss": 0.5541, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.3692951202392578, - "rewards/margins": 0.37497708201408386, - "rewards/rejected": -0.00568196177482605, + "logits/chosen": -1.8702497482299805, + "logits/rejected": -2.0599899291992188, + "logps/chosen": -619.893798828125, + "logps/rejected": -359.8669738769531, + "loss": 0.4655, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": 0.6730003356933594, + "rewards/margins": 0.9043228626251221, + "rewards/rejected": -0.23132255673408508, "step": 17 }, { "epoch": 0.0012518691101297075, - "grad_norm": 45.5, + "grad_norm": 36.0, "learning_rate": 9.999999349933978e-06, - "logits/chosen": -1.939241647720337, - "logits/rejected": -1.8147399425506592, - "logps/chosen": -342.1923828125, - "logps/rejected": -257.8717041015625, - "loss": 0.613, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": 0.24674087762832642, - "rewards/margins": 0.25011828541755676, - "rewards/rejected": -0.0033773984760046005, + "logits/chosen": -1.8048973083496094, + "logits/rejected": -1.7730138301849365, + "logps/chosen": -380.25897216796875, + "logps/rejected": -266.96197509765625, + "loss": 0.5433, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.39052316546440125, + "rewards/margins": 0.5715048909187317, + "rewards/rejected": -0.18098172545433044, "step": 18 }, { "epoch": 0.0013214173940258025, - "grad_norm": 36.75, + "grad_norm": 37.5, "learning_rate": 9.999999150934181e-06, - "logits/chosen": -2.0561864376068115, - "logits/rejected": -2.2967233657836914, - "logps/chosen": -390.05010986328125, - "logps/rejected": -337.4217529296875, - "loss": 0.5116, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": 0.5211931467056274, - "rewards/margins": 0.6085362434387207, - "rewards/rejected": -0.08734303712844849, + "logits/chosen": -1.971044659614563, + "logits/rejected": -2.030426502227783, + "logps/chosen": -310.97601318359375, + "logps/rejected": -321.2936706542969, + "loss": 0.6658, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.05187017470598221, + "rewards/margins": 0.07821687310934067, + "rewards/rejected": -0.13008704781532288, "step": 19 }, { "epoch": 0.0013909656779218973, - "grad_norm": 38.5, + "grad_norm": 48.0, "learning_rate": 9.99999892540108e-06, - "logits/chosen": -1.960367202758789, - "logits/rejected": -2.2729439735412598, - "logps/chosen": -363.2322998046875, - "logps/rejected": -340.948486328125, - "loss": 0.5539, - "rewards/accuracies": 0.7000000476837158, - "rewards/chosen": 0.4272863268852234, - "rewards/margins": 0.5176402926445007, - "rewards/rejected": -0.09035397320985794, + "logits/chosen": -2.0395307540893555, + "logits/rejected": -2.1145153045654297, + "logps/chosen": -360.7062683105469, + "logps/rejected": -232.93731689453125, + "loss": 0.6289, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.13396745920181274, + "rewards/margins": 0.21033848822116852, + "rewards/rejected": -0.07637102156877518, "step": 20 }, { "epoch": 0.001460513961817992, - "grad_norm": 48.5, + "grad_norm": 32.5, "learning_rate": 9.999998673334676e-06, - "logits/chosen": -2.011977195739746, - "logits/rejected": -1.9841499328613281, - "logps/chosen": -411.01007080078125, - "logps/rejected": -319.31280517578125, - "loss": 0.537, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.5278565883636475, - "rewards/margins": 0.6329424381256104, - "rewards/rejected": -0.1050858199596405, + "logits/chosen": -1.5960674285888672, + "logits/rejected": -1.522066593170166, + "logps/chosen": -306.71783447265625, + "logps/rejected": -225.61178588867188, + "loss": 0.5629, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": 0.23635677993297577, + "rewards/margins": 0.415762722492218, + "rewards/rejected": -0.17940592765808105, "step": 21 }, { "epoch": 0.001530062245714087, - "grad_norm": 48.25, + "grad_norm": 42.5, "learning_rate": 9.999998394734974e-06, - "logits/chosen": -2.181118965148926, - "logits/rejected": -2.3463783264160156, - "logps/chosen": -358.7189025878906, - "logps/rejected": -483.25201416015625, - "loss": 0.6464, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": 0.3018140494823456, - "rewards/margins": 0.17182397842407227, - "rewards/rejected": 0.1299901008605957, + "logits/chosen": -2.1824440956115723, + "logits/rejected": -2.3484952449798584, + "logps/chosen": -449.4209899902344, + "logps/rejected": -241.85476684570312, + "loss": 0.6067, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.36920052766799927, + "rewards/margins": 0.40295812487602234, + "rewards/rejected": -0.03375759348273277, "step": 22 }, { "epoch": 0.0015996105296101819, - "grad_norm": 44.25, + "grad_norm": 32.0, "learning_rate": 9.999998089601973e-06, - "logits/chosen": -1.5713448524475098, - "logits/rejected": -1.7370574474334717, - "logps/chosen": -342.9325256347656, - "logps/rejected": -324.1170349121094, - "loss": 0.5884, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.3992369771003723, - "rewards/margins": 0.3364359140396118, - "rewards/rejected": 0.0628010556101799, + "logits/chosen": -1.7668521404266357, + "logits/rejected": -1.9323323965072632, + "logps/chosen": -270.4325866699219, + "logps/rejected": -246.8863525390625, + "loss": 0.4987, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": 0.2003074586391449, + "rewards/margins": 0.6068445444107056, + "rewards/rejected": -0.40653708577156067, "step": 23 }, { "epoch": 0.0016691588135062767, - "grad_norm": 49.0, + "grad_norm": 33.0, "learning_rate": 9.999997757935672e-06, - "logits/chosen": -2.326936721801758, - "logits/rejected": -2.602668523788452, - "logps/chosen": -485.84942626953125, - "logps/rejected": -358.29962158203125, - "loss": 0.4842, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.8100219368934631, - "rewards/margins": 0.9430795907974243, - "rewards/rejected": -0.13305766880512238, + "logits/chosen": -1.9550578594207764, + "logits/rejected": -2.0033726692199707, + "logps/chosen": -320.02288818359375, + "logps/rejected": -309.55316162109375, + "loss": 0.6091, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.13415774703025818, + "rewards/margins": 0.33526551723480225, + "rewards/rejected": -0.20110778510570526, "step": 24 }, { "epoch": 0.0017387070974023717, - "grad_norm": 46.75, + "grad_norm": 50.5, "learning_rate": 9.999997399736077e-06, - "logits/chosen": -1.874396562576294, - "logits/rejected": -2.139902114868164, - "logps/chosen": -367.8358154296875, - "logps/rejected": -382.55145263671875, - "loss": 0.5929, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.208806112408638, - "rewards/margins": 0.30954980850219727, - "rewards/rejected": -0.10074371844530106, + "logits/chosen": -2.1274514198303223, + "logits/rejected": -2.2854597568511963, + "logps/chosen": -383.732666015625, + "logps/rejected": -298.32220458984375, + "loss": 0.5277, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.46521762013435364, + "rewards/margins": 0.7974266409873962, + "rewards/rejected": -0.332209050655365, "step": 25 }, { "epoch": 0.0018082553812984665, - "grad_norm": 39.5, + "grad_norm": 43.0, "learning_rate": 9.999997015003187e-06, - "logits/chosen": -1.8699212074279785, - "logits/rejected": -1.8992388248443604, - "logps/chosen": -445.5076599121094, - "logps/rejected": -342.38800048828125, - "loss": 0.4884, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.7023246884346008, - "rewards/margins": 0.8794023990631104, - "rewards/rejected": -0.17707765102386475, + "logits/chosen": -2.128185272216797, + "logits/rejected": -2.248570442199707, + "logps/chosen": -391.1646728515625, + "logps/rejected": -266.12884521484375, + "loss": 0.5068, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.44852790236473083, + "rewards/margins": 0.6601754426956177, + "rewards/rejected": -0.21164758503437042, "step": 26 }, { "epoch": 0.0018778036651945613, - "grad_norm": 41.5, + "grad_norm": 40.75, "learning_rate": 9.999996603737007e-06, - "logits/chosen": -1.9606945514678955, - "logits/rejected": -1.9597727060317993, - "logps/chosen": -366.387939453125, - "logps/rejected": -338.66534423828125, - "loss": 0.5415, - "rewards/accuracies": 0.7000000476837158, - "rewards/chosen": 0.4197579622268677, - "rewards/margins": 0.430759459733963, - "rewards/rejected": -0.011001495644450188, + "logits/chosen": -1.622370958328247, + "logits/rejected": -1.7693196535110474, + "logps/chosen": -337.97454833984375, + "logps/rejected": -295.5948486328125, + "loss": 0.5931, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.2101454883813858, + "rewards/margins": 0.40968072414398193, + "rewards/rejected": -0.19953517615795135, "step": 27 }, { "epoch": 0.0019473519490906563, - "grad_norm": 37.5, + "grad_norm": 39.5, "learning_rate": 9.999996165937535e-06, - "logits/chosen": -1.8974876403808594, - "logits/rejected": -1.5265250205993652, - "logps/chosen": -304.15374755859375, - "logps/rejected": -204.1177520751953, - "loss": 0.4434, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.8065065145492554, - "rewards/margins": 0.778836727142334, - "rewards/rejected": 0.027669839560985565, + "logits/chosen": -1.895212173461914, + "logits/rejected": -2.0415358543395996, + "logps/chosen": -284.90380859375, + "logps/rejected": -283.76348876953125, + "loss": 0.538, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.32501882314682007, + "rewards/margins": 0.5738599896430969, + "rewards/rejected": -0.24884119629859924, "step": 28 }, { "epoch": 0.002016900232986751, - "grad_norm": 52.0, + "grad_norm": 58.25, "learning_rate": 9.999995701604779e-06, - "logits/chosen": -1.9435765743255615, - "logits/rejected": -2.162288188934326, - "logps/chosen": -258.3497619628906, - "logps/rejected": -323.7847900390625, - "loss": 0.5442, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.2223002314567566, - "rewards/margins": 0.5276059508323669, - "rewards/rejected": -0.30530574917793274, + "logits/chosen": -2.2158336639404297, + "logits/rejected": -2.4598889350891113, + "logps/chosen": -311.6733093261719, + "logps/rejected": -449.92138671875, + "loss": 0.6588, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": -0.036888279020786285, + "rewards/margins": 0.2800174355506897, + "rewards/rejected": -0.3169057369232178, "step": 29 }, { "epoch": 0.002086448516882846, - "grad_norm": 33.0, + "grad_norm": 38.75, "learning_rate": 9.999995210738735e-06, - "logits/chosen": -1.8861711025238037, - "logits/rejected": -2.1225740909576416, - "logps/chosen": -298.01519775390625, - "logps/rejected": -256.1895751953125, - "loss": 0.5607, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.4823571741580963, - "rewards/margins": 0.5623458027839661, - "rewards/rejected": -0.07998859882354736, + "logits/chosen": -2.0205981731414795, + "logits/rejected": -1.9132325649261475, + "logps/chosen": -330.7354736328125, + "logps/rejected": -256.1319580078125, + "loss": 0.5792, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": 0.2625240385532379, + "rewards/margins": 0.4181574881076813, + "rewards/rejected": -0.15563346445560455, "step": 30 }, { "epoch": 0.002155996800778941, - "grad_norm": 40.0, + "grad_norm": 31.125, "learning_rate": 9.99999469333941e-06, - "logits/chosen": -2.166451930999756, - "logits/rejected": -2.055312156677246, - "logps/chosen": -327.69818115234375, - "logps/rejected": -240.5767364501953, - "loss": 0.5968, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": 0.420712411403656, - "rewards/margins": 0.4195905923843384, - "rewards/rejected": 0.0011218097060918808, + "logits/chosen": -2.0172243118286133, + "logits/rejected": -2.0048766136169434, + "logps/chosen": -411.6480712890625, + "logps/rejected": -276.4316711425781, + "loss": 0.4802, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.4169757068157196, + "rewards/margins": 0.7361117005348206, + "rewards/rejected": -0.31913602352142334, "step": 31 }, { "epoch": 0.0022255450846750354, - "grad_norm": 45.75, + "grad_norm": 34.75, "learning_rate": 9.999994149406806e-06, - "logits/chosen": -1.538217306137085, - "logits/rejected": -1.6282873153686523, - "logps/chosen": -339.112060546875, - "logps/rejected": -381.658935546875, - "loss": 0.4853, - "rewards/accuracies": 0.7000000476837158, - "rewards/chosen": 0.630713939666748, - "rewards/margins": 0.9845514297485352, - "rewards/rejected": -0.3538374900817871, + "logits/chosen": -1.7540080547332764, + "logits/rejected": -1.7203712463378906, + "logps/chosen": -326.21844482421875, + "logps/rejected": -256.66680908203125, + "loss": 0.5273, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3268147110939026, + "rewards/margins": 0.530017077922821, + "rewards/rejected": -0.20320232212543488, "step": 32 }, { "epoch": 0.0022950933685711304, - "grad_norm": 53.5, + "grad_norm": 33.75, "learning_rate": 9.999993578940924e-06, - "logits/chosen": -2.2110612392425537, - "logits/rejected": -2.4202797412872314, - "logps/chosen": -339.7915954589844, - "logps/rejected": -298.7029113769531, - "loss": 0.6482, - "rewards/accuracies": 0.44999998807907104, - "rewards/chosen": 0.4733724296092987, - "rewards/margins": 0.4534761905670166, - "rewards/rejected": 0.01989627629518509, + "logits/chosen": -1.7028422355651855, + "logits/rejected": -1.544898509979248, + "logps/chosen": -395.32879638671875, + "logps/rejected": -336.3634948730469, + "loss": 0.4242, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7405567169189453, + "rewards/margins": 1.1268622875213623, + "rewards/rejected": -0.3863055408000946, "step": 33 }, { "epoch": 0.0023646416524672254, - "grad_norm": 29.125, + "grad_norm": 40.75, "learning_rate": 9.99999298194177e-06, - "logits/chosen": -1.4149713516235352, - "logits/rejected": -1.5838656425476074, - "logps/chosen": -293.2918395996094, - "logps/rejected": -277.6675109863281, - "loss": 0.398, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": 0.837464451789856, - "rewards/margins": 1.1750701665878296, - "rewards/rejected": -0.3376057744026184, + "logits/chosen": -2.059645652770996, + "logits/rejected": -2.0634074211120605, + "logps/chosen": -505.163330078125, + "logps/rejected": -404.1461486816406, + "loss": 0.4785, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.945605993270874, + "rewards/margins": 1.1153727769851685, + "rewards/rejected": -0.16976679861545563, "step": 34 }, { "epoch": 0.0024341899363633204, - "grad_norm": 38.25, + "grad_norm": 47.75, "learning_rate": 9.999992358409344e-06, - "logits/chosen": -2.049433469772339, - "logits/rejected": -2.023080825805664, - "logps/chosen": -310.4878845214844, - "logps/rejected": -383.2558898925781, - "loss": 0.4845, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.4835648834705353, - "rewards/margins": 0.8006283640861511, - "rewards/rejected": -0.31706345081329346, + "logits/chosen": -2.4186923503875732, + "logits/rejected": -2.5858025550842285, + "logps/chosen": -395.63726806640625, + "logps/rejected": -365.5321960449219, + "loss": 0.4765, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.6087393760681152, + "rewards/margins": 0.9764730930328369, + "rewards/rejected": -0.36773380637168884, "step": 35 }, { "epoch": 0.002503738220259415, - "grad_norm": 30.5, + "grad_norm": 54.25, "learning_rate": 9.999991708343653e-06, - "logits/chosen": -2.1499810218811035, - "logits/rejected": -1.922285556793213, - "logps/chosen": -271.944091796875, - "logps/rejected": -243.01390075683594, - "loss": 0.6278, - "rewards/accuracies": 0.5, - "rewards/chosen": 0.27908122539520264, - "rewards/margins": 0.35932162404060364, - "rewards/rejected": -0.08024037629365921, + "logits/chosen": -2.3518059253692627, + "logits/rejected": -2.709664821624756, + "logps/chosen": -316.47528076171875, + "logps/rejected": -436.5918273925781, + "loss": 0.5137, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.5632300972938538, + "rewards/margins": 0.7559100389480591, + "rewards/rejected": -0.19267994165420532, "step": 36 }, { "epoch": 0.00257328650415551, - "grad_norm": 36.25, + "grad_norm": 46.25, "learning_rate": 9.999991031744697e-06, - "logits/chosen": -1.8825230598449707, - "logits/rejected": -2.1492137908935547, - "logps/chosen": -377.55389404296875, - "logps/rejected": -315.49395751953125, - "loss": 0.4845, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.6905852556228638, - "rewards/margins": 0.961159348487854, - "rewards/rejected": -0.27057409286499023, + "logits/chosen": -2.456254482269287, + "logits/rejected": -2.556328773498535, + "logps/chosen": -525.0447998046875, + "logps/rejected": -434.83319091796875, + "loss": 0.4827, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": 0.912708044052124, + "rewards/margins": 1.1608047485351562, + "rewards/rejected": -0.2480967789888382, "step": 37 }, { "epoch": 0.002642834788051605, - "grad_norm": 34.25, + "grad_norm": 50.0, "learning_rate": 9.999990328612482e-06, - "logits/chosen": -1.8724392652511597, - "logits/rejected": -2.0674619674682617, - "logps/chosen": -408.82843017578125, - "logps/rejected": -333.91925048828125, - "loss": 0.4625, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.7234601974487305, - "rewards/margins": 1.1639423370361328, - "rewards/rejected": -0.44048213958740234, + "logits/chosen": -2.2290701866149902, + "logits/rejected": -2.331836223602295, + "logps/chosen": -434.9827880859375, + "logps/rejected": -366.12591552734375, + "loss": 0.479, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.8116751909255981, + "rewards/margins": 1.0686509609222412, + "rewards/rejected": -0.2569756507873535, "step": 38 }, { "epoch": 0.0027123830719476996, - "grad_norm": 37.75, + "grad_norm": 27.875, "learning_rate": 9.999989598947008e-06, - "logits/chosen": -1.8963747024536133, - "logits/rejected": -1.8232009410858154, - "logps/chosen": -336.85894775390625, - "logps/rejected": -242.29061889648438, - "loss": 0.5546, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.44201433658599854, - "rewards/margins": 0.6604232788085938, - "rewards/rejected": -0.21840900182724, + "logits/chosen": -1.5525405406951904, + "logits/rejected": -2.134503126144409, + "logps/chosen": -335.6802673339844, + "logps/rejected": -382.2537841796875, + "loss": 0.4817, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.7176051735877991, + "rewards/margins": 1.057665228843689, + "rewards/rejected": -0.34006014466285706, "step": 39 }, { "epoch": 0.0027819313558437946, - "grad_norm": 36.25, + "grad_norm": 46.5, "learning_rate": 9.999988842748285e-06, - "logits/chosen": -2.259315013885498, - "logits/rejected": -2.522401809692383, - "logps/chosen": -330.76104736328125, - "logps/rejected": -311.228271484375, - "loss": 0.5086, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.6043640971183777, - "rewards/margins": 0.7918596267700195, - "rewards/rejected": -0.18749551475048065, + "logits/chosen": -2.0863614082336426, + "logits/rejected": -2.0769240856170654, + "logps/chosen": -436.5563659667969, + "logps/rejected": -304.5190734863281, + "loss": 0.4402, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.9865360856056213, + "rewards/margins": 1.1425951719284058, + "rewards/rejected": -0.15605907142162323, "step": 40 }, { "epoch": 0.0028514796397398896, - "grad_norm": 49.75, + "grad_norm": 41.75, "learning_rate": 9.99998806001631e-06, - "logits/chosen": -1.7490835189819336, - "logits/rejected": -1.6037665605545044, - "logps/chosen": -316.95513916015625, - "logps/rejected": -196.12319946289062, - "loss": 0.5071, - "rewards/accuracies": 0.7000000476837158, - "rewards/chosen": 0.6437100172042847, - "rewards/margins": 0.7891340255737305, - "rewards/rejected": -0.14542406797409058, + "logits/chosen": -1.7388699054718018, + "logits/rejected": -2.2738285064697266, + "logps/chosen": -216.63003540039062, + "logps/rejected": -324.6629333496094, + "loss": 0.5432, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.5351641178131104, + "rewards/margins": 0.9341287016868591, + "rewards/rejected": -0.39896470308303833, "step": 41 }, { "epoch": 0.002921027923635984, - "grad_norm": 64.0, + "grad_norm": 29.25, "learning_rate": 9.999987250751094e-06, - "logits/chosen": -2.132702350616455, - "logits/rejected": -2.355300188064575, - "logps/chosen": -449.4794921875, - "logps/rejected": -484.9307861328125, - "loss": 0.5231, - "rewards/accuracies": 0.7000000476837158, - "rewards/chosen": 0.6490827798843384, - "rewards/margins": 1.0969290733337402, - "rewards/rejected": -0.447846382856369, + "logits/chosen": -1.789241909980774, + "logits/rejected": -2.0647335052490234, + "logps/chosen": -359.7667236328125, + "logps/rejected": -298.1225891113281, + "loss": 0.5015, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.8949040174484253, + "rewards/margins": 1.1206433773040771, + "rewards/rejected": -0.22573941946029663, "step": 42 }, { "epoch": 0.002990576207532079, - "grad_norm": 38.0, + "grad_norm": 39.75, "learning_rate": 9.999986414952638e-06, - "logits/chosen": -2.0872957706451416, - "logits/rejected": -2.217331647872925, - "logps/chosen": -469.4587707519531, - "logps/rejected": -405.3837890625, - "loss": 0.4306, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 1.0562833547592163, - "rewards/margins": 1.2830203771591187, - "rewards/rejected": -0.22673699259757996, + "logits/chosen": -1.7696220874786377, + "logits/rejected": -1.848757266998291, + "logps/chosen": -376.5746765136719, + "logps/rejected": -298.95928955078125, + "loss": 0.4704, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": 1.0613232851028442, + "rewards/margins": 0.9406245350837708, + "rewards/rejected": 0.12069880217313766, "step": 43 }, { "epoch": 0.003060124491428174, - "grad_norm": 30.25, + "grad_norm": 45.75, "learning_rate": 9.999985552620944e-06, - "logits/chosen": -1.765653371810913, - "logits/rejected": -2.0210206508636475, - "logps/chosen": -344.2925720214844, - "logps/rejected": -326.77099609375, - "loss": 0.4337, - "rewards/accuracies": 0.75, - "rewards/chosen": 1.1038718223571777, - "rewards/margins": 1.2277381420135498, - "rewards/rejected": -0.12386628985404968, + "logits/chosen": -1.7304034233093262, + "logits/rejected": -2.0260019302368164, + "logps/chosen": -298.7785339355469, + "logps/rejected": -440.1461181640625, + "loss": 0.5538, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.4868564009666443, + "rewards/margins": 0.732475221157074, + "rewards/rejected": -0.24561886489391327, "step": 44 }, { "epoch": 0.0031296727753242688, - "grad_norm": 41.5, + "grad_norm": 36.25, "learning_rate": 9.999984663756021e-06, - "logits/chosen": -2.121791124343872, - "logits/rejected": -2.3115389347076416, - "logps/chosen": -218.78921508789062, - "logps/rejected": -299.87066650390625, - "loss": 0.6018, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.14280036091804504, - "rewards/margins": 0.4419238567352295, - "rewards/rejected": -0.29912346601486206, + "logits/chosen": -2.0583691596984863, + "logits/rejected": -2.061103343963623, + "logps/chosen": -441.5859375, + "logps/rejected": -218.41436767578125, + "loss": 0.4619, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.9443624019622803, + "rewards/margins": 1.0102788209915161, + "rewards/rejected": -0.06591643393039703, "step": 45 }, { "epoch": 0.0031992210592203638, - "grad_norm": 59.5, + "grad_norm": 40.0, "learning_rate": 9.99998374835787e-06, - "logits/chosen": -2.3500468730926514, - "logits/rejected": -2.43585205078125, - "logps/chosen": -501.69488525390625, - "logps/rejected": -357.568603515625, - "loss": 0.5061, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.8865605592727661, - "rewards/margins": 0.9040858745574951, - "rewards/rejected": -0.017525240778923035, + "logits/chosen": -1.8798418045043945, + "logits/rejected": -2.0788285732269287, + "logps/chosen": -382.5895690917969, + "logps/rejected": -272.2588806152344, + "loss": 0.5828, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 0.7576146721839905, + "rewards/margins": 0.743416428565979, + "rewards/rejected": 0.014198275282979012, "step": 46 }, { "epoch": 0.003268769343116459, - "grad_norm": 69.0, + "grad_norm": 41.75, "learning_rate": 9.9999828064265e-06, - "logits/chosen": -2.1080899238586426, - "logits/rejected": -2.2176952362060547, - "logps/chosen": -282.1893005371094, - "logps/rejected": -297.91497802734375, - "loss": 0.6951, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.02138434909284115, - "rewards/margins": 0.1782296597957611, - "rewards/rejected": -0.1568453013896942, + "logits/chosen": -2.262146472930908, + "logits/rejected": -2.21658992767334, + "logps/chosen": -278.13507080078125, + "logps/rejected": -323.96978759765625, + "loss": 0.5376, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3078311085700989, + "rewards/margins": 0.6501657962799072, + "rewards/rejected": -0.34233468770980835, "step": 47 }, { "epoch": 0.0033383176270125534, - "grad_norm": 41.75, + "grad_norm": 49.5, "learning_rate": 9.999981837961911e-06, - "logits/chosen": -1.7342658042907715, - "logits/rejected": -2.018191337585449, - "logps/chosen": -341.4564208984375, - "logps/rejected": -295.56878662109375, - "loss": 0.569, - "rewards/accuracies": 0.5, - "rewards/chosen": 0.703053891658783, - "rewards/margins": 1.032260537147522, - "rewards/rejected": -0.3292067348957062, + "logits/chosen": -1.8984332084655762, + "logits/rejected": -1.9285097122192383, + "logps/chosen": -297.12908935546875, + "logps/rejected": -273.7828369140625, + "loss": 0.549, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.6155263185501099, + "rewards/margins": 0.5832459330558777, + "rewards/rejected": 0.03228042274713516, "step": 48 }, { "epoch": 0.0034078659109086484, - "grad_norm": 66.5, + "grad_norm": 58.25, "learning_rate": 9.999980842964112e-06, - "logits/chosen": -2.1717751026153564, - "logits/rejected": -2.1878905296325684, - "logps/chosen": -369.5394287109375, - "logps/rejected": -301.9841003417969, - "loss": 0.7157, + "logits/chosen": -1.8797364234924316, + "logits/rejected": -2.1005911827087402, + "logps/chosen": -349.6468200683594, + "logps/rejected": -325.0228271484375, + "loss": 0.5917, "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.04911491274833679, - "rewards/margins": 0.14859969913959503, - "rewards/rejected": -0.09948478639125824, + "rewards/chosen": 0.722574770450592, + "rewards/margins": 0.9426010847091675, + "rewards/rejected": -0.22002632915973663, "step": 49 }, { "epoch": 0.0034774141948047434, - "grad_norm": 48.75, + "grad_norm": 31.75, "learning_rate": 9.999979821433108e-06, - "logits/chosen": -1.8801887035369873, - "logits/rejected": -1.985775113105774, - "logps/chosen": -313.116455078125, - "logps/rejected": -291.5997009277344, - "loss": 0.6104, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.5421634912490845, - "rewards/margins": 0.42026472091674805, - "rewards/rejected": 0.12189877033233643, + "logits/chosen": -1.588087558746338, + "logits/rejected": -1.6070778369903564, + "logps/chosen": -342.11431884765625, + "logps/rejected": -326.7553405761719, + "loss": 0.4512, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5920666456222534, + "rewards/margins": 0.9584090709686279, + "rewards/rejected": -0.3663424849510193, "step": 50 }, { "epoch": 0.003546962478700838, - "grad_norm": 43.75, + "grad_norm": 45.25, "learning_rate": 9.999978773368901e-06, - "logits/chosen": -1.8641619682312012, - "logits/rejected": -2.0486137866973877, - "logps/chosen": -295.24859619140625, - "logps/rejected": -295.7868347167969, - "loss": 0.5751, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": 0.345305860042572, - "rewards/margins": 0.6953895092010498, - "rewards/rejected": -0.35008370876312256, + "logits/chosen": -2.176466226577759, + "logits/rejected": -2.464456558227539, + "logps/chosen": -370.1783142089844, + "logps/rejected": -408.2772521972656, + "loss": 0.4937, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6067290306091309, + "rewards/margins": 1.181972622871399, + "rewards/rejected": -0.5752436518669128, "step": 51 }, { "epoch": 0.003616510762596933, - "grad_norm": 37.0, + "grad_norm": 40.0, "learning_rate": 9.999977698771501e-06, - "logits/chosen": -1.5793604850769043, - "logits/rejected": -1.7689101696014404, - "logps/chosen": -358.31207275390625, - "logps/rejected": -295.0545654296875, - "loss": 0.381, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": 1.3175045251846313, - "rewards/margins": 1.5950672626495361, - "rewards/rejected": -0.2775627374649048, + "logits/chosen": -2.0220046043395996, + "logits/rejected": -2.035100221633911, + "logps/chosen": -456.7289733886719, + "logps/rejected": -314.79364013671875, + "loss": 0.5063, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.7426163554191589, + "rewards/margins": 1.0831964015960693, + "rewards/rejected": -0.340580016374588, "step": 52 }, { "epoch": 0.003686059046493028, - "grad_norm": 40.5, + "grad_norm": 52.5, "learning_rate": 9.999976597640911e-06, - "logits/chosen": -2.200587511062622, - "logits/rejected": -2.0280280113220215, - "logps/chosen": -320.01605224609375, - "logps/rejected": -278.0500793457031, - "loss": 0.5268, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": 0.34178099036216736, - "rewards/margins": 0.6551687717437744, - "rewards/rejected": -0.31338778138160706, + "logits/chosen": -1.7118637561798096, + "logits/rejected": -2.05117130279541, + "logps/chosen": -390.32513427734375, + "logps/rejected": -435.59271240234375, + "loss": 0.5453, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6873376369476318, + "rewards/margins": 0.8972060084342957, + "rewards/rejected": -0.20986828207969666, "step": 53 }, { "epoch": 0.0037556073303891225, - "grad_norm": 29.0, + "grad_norm": 43.0, "learning_rate": 9.999975469977138e-06, - "logits/chosen": -1.721149206161499, - "logits/rejected": -2.0656189918518066, - "logps/chosen": -296.03070068359375, - "logps/rejected": -314.07501220703125, - "loss": 0.4937, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.863209068775177, - "rewards/margins": 0.987412691116333, - "rewards/rejected": -0.12420366704463959, + "logits/chosen": -2.2851576805114746, + "logits/rejected": -2.302513360977173, + "logps/chosen": -258.1156005859375, + "logps/rejected": -221.02236938476562, + "loss": 0.5965, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.680258572101593, + "rewards/margins": 0.5080844759941101, + "rewards/rejected": 0.17217415571212769, "step": 54 }, { "epoch": 0.0038251556142852175, - "grad_norm": 39.75, + "grad_norm": 48.5, "learning_rate": 9.999974315780188e-06, - "logits/chosen": -1.9110510349273682, - "logits/rejected": -1.8734188079833984, - "logps/chosen": -435.803955078125, - "logps/rejected": -368.73468017578125, - "loss": 0.3871, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.935944676399231, - "rewards/margins": 1.3189483880996704, - "rewards/rejected": -0.38300377130508423, + "logits/chosen": -1.7847936153411865, + "logits/rejected": -1.7329051494598389, + "logps/chosen": -380.6269836425781, + "logps/rejected": -293.39849853515625, + "loss": 0.5798, + "rewards/accuracies": 0.6500000357627869, + "rewards/chosen": 0.719537079334259, + "rewards/margins": 0.6922656297683716, + "rewards/rejected": 0.02727138251066208, "step": 55 }, { "epoch": 0.0038947038981813125, - "grad_norm": 53.25, + "grad_norm": 38.5, "learning_rate": 9.999973135050064e-06, - "logits/chosen": -2.02524995803833, - "logits/rejected": -1.8700392246246338, - "logps/chosen": -464.5777587890625, - "logps/rejected": -361.63238525390625, - "loss": 0.464, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.9357064962387085, - "rewards/margins": 1.0273995399475098, - "rewards/rejected": -0.09169299900531769, + "logits/chosen": -1.9533004760742188, + "logits/rejected": -2.1731998920440674, + "logps/chosen": -306.71240234375, + "logps/rejected": -251.59255981445312, + "loss": 0.546, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": 0.6329398155212402, + "rewards/margins": 0.6731973886489868, + "rewards/rejected": -0.04025762155652046, "step": 56 }, { "epoch": 0.0039642521820774075, - "grad_norm": 93.5, + "grad_norm": 48.5, "learning_rate": 9.999971927786778e-06, - "logits/chosen": -2.032268762588501, - "logits/rejected": -2.027820587158203, - "logps/chosen": -332.2871398925781, - "logps/rejected": -283.9701843261719, - "loss": 0.5514, + "logits/chosen": -2.147404670715332, + "logits/rejected": -2.0842392444610596, + "logps/chosen": -418.43280029296875, + "logps/rejected": -284.262451171875, + "loss": 0.6069, "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.6253789663314819, - "rewards/margins": 0.7813310623168945, - "rewards/rejected": -0.15595212578773499, + "rewards/chosen": 0.5127484798431396, + "rewards/margins": 0.9163163900375366, + "rewards/rejected": -0.4035680294036865, "step": 57 }, { "epoch": 0.004033800465973502, - "grad_norm": 50.75, + "grad_norm": 61.5, "learning_rate": 9.999970693990332e-06, - "logits/chosen": -2.1520893573760986, - "logits/rejected": -2.266171932220459, - "logps/chosen": -370.4326171875, - "logps/rejected": -406.26806640625, - "loss": 0.513, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": 0.5668460130691528, - "rewards/margins": 0.9406030178070068, - "rewards/rejected": -0.37375694513320923, + "logits/chosen": -2.0679097175598145, + "logits/rejected": -2.0026912689208984, + "logps/chosen": -422.0777587890625, + "logps/rejected": -337.8226318359375, + "loss": 0.5848, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.3221449553966522, + "rewards/margins": 0.47046032547950745, + "rewards/rejected": -0.14831538498401642, "step": 58 }, { "epoch": 0.004103348749869597, - "grad_norm": 33.25, + "grad_norm": 20.375, "learning_rate": 9.999969433660734e-06, - "logits/chosen": -1.7560322284698486, - "logits/rejected": -1.6698275804519653, - "logps/chosen": -449.2725830078125, - "logps/rejected": -276.92449951171875, - "loss": 0.4161, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": 0.7296736240386963, - "rewards/margins": 1.052181601524353, - "rewards/rejected": -0.3225080072879791, + "logits/chosen": -1.7224586009979248, + "logits/rejected": -1.709118127822876, + "logps/chosen": -390.605712890625, + "logps/rejected": -306.4034729003906, + "loss": 0.4294, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.7527649402618408, + "rewards/margins": 1.0507557392120361, + "rewards/rejected": -0.2979908585548401, "step": 59 }, { "epoch": 0.004172897033765692, - "grad_norm": 37.5, + "grad_norm": 37.25, "learning_rate": 9.99996814679799e-06, - "logits/chosen": -1.7677528858184814, - "logits/rejected": -1.7541583776474, - "logps/chosen": -355.5475158691406, - "logps/rejected": -272.10601806640625, - "loss": 0.5261, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.7593498826026917, - "rewards/margins": 0.7893234491348267, - "rewards/rejected": -0.029973585158586502, + "logits/chosen": -2.0240750312805176, + "logits/rejected": -2.1049044132232666, + "logps/chosen": -474.79058837890625, + "logps/rejected": -417.6142883300781, + "loss": 0.5158, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": 0.7541247010231018, + "rewards/margins": 1.0440995693206787, + "rewards/rejected": -0.2899748980998993, "step": 60 }, { "epoch": 0.004242445317661787, - "grad_norm": 38.25, + "grad_norm": 36.0, "learning_rate": 9.999966833402108e-06, - "logits/chosen": -2.5125160217285156, - "logits/rejected": -2.479806423187256, - "logps/chosen": -361.8642272949219, - "logps/rejected": -210.57876586914062, - "loss": 0.5354, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.6074139475822449, - "rewards/margins": 0.7896459698677063, - "rewards/rejected": -0.18223202228546143, + "logits/chosen": -1.6678078174591064, + "logits/rejected": -2.237013816833496, + "logps/chosen": -321.34832763671875, + "logps/rejected": -373.34893798828125, + "loss": 0.524, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": 0.8338617086410522, + "rewards/margins": 0.9398175477981567, + "rewards/rejected": -0.10595586150884628, "step": 61 }, { "epoch": 0.004311993601557882, - "grad_norm": 47.0, + "grad_norm": 57.25, "learning_rate": 9.999965493473094e-06, - "logits/chosen": -2.018270969390869, - "logits/rejected": -1.872260332107544, - "logps/chosen": -362.58184814453125, - "logps/rejected": -261.330078125, - "loss": 0.4872, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.7447779178619385, - "rewards/margins": 1.3671720027923584, - "rewards/rejected": -0.6223940253257751, + "logits/chosen": -2.0279343128204346, + "logits/rejected": -2.1152455806732178, + "logps/chosen": -340.81353759765625, + "logps/rejected": -349.94781494140625, + "loss": 0.58, + "rewards/accuracies": 0.6500000357627869, + "rewards/chosen": 0.8443183302879333, + "rewards/margins": 1.0211808681488037, + "rewards/rejected": -0.1768626570701599, "step": 62 }, { - "epoch": 0.004311993601557882, - "eval_logits/chosen": -1.8774892091751099, - "eval_logits/rejected": -1.917432188987732, - "eval_logps/chosen": -362.68548583984375, - "eval_logps/rejected": -294.7613830566406, - "eval_loss": 0.4743961989879608, - "eval_rewards/accuracies": 0.7718254327774048, - "eval_rewards/chosen": 0.8723062872886658, - "eval_rewards/margins": 0.9760147333145142, - "eval_rewards/rejected": -0.10370844602584839, - "eval_runtime": 62.5506, - "eval_samples_per_second": 2.654, - "eval_steps_per_second": 0.336, - "step": 62 + "epoch": 0.004381541885453977, + "grad_norm": 51.0, + "learning_rate": 9.999964127010956e-06, + "logits/chosen": -2.2341127395629883, + "logits/rejected": -1.9947443008422852, + "logps/chosen": -288.8426208496094, + "logps/rejected": -316.2739562988281, + "loss": 0.5977, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.45409274101257324, + "rewards/margins": 0.3700726628303528, + "rewards/rejected": 0.08402010053396225, + "step": 63 + }, + { + "epoch": 0.004451090169350071, + "grad_norm": 45.5, + "learning_rate": 9.999962734015701e-06, + "logits/chosen": -2.200594902038574, + "logits/rejected": -2.273937225341797, + "logps/chosen": -298.3979187011719, + "logps/rejected": -271.6072998046875, + "loss": 0.6234, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.3760390877723694, + "rewards/margins": 0.3377527594566345, + "rewards/rejected": 0.03828636556863785, + "step": 64 + }, + { + "epoch": 0.004520638453246166, + "grad_norm": 46.75, + "learning_rate": 9.999961314487337e-06, + "logits/chosen": -2.1463193893432617, + "logits/rejected": -2.2315433025360107, + "logps/chosen": -343.3995361328125, + "logps/rejected": -329.48431396484375, + "loss": 0.4696, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": 0.7115351557731628, + "rewards/margins": 0.9584488868713379, + "rewards/rejected": -0.24691380560398102, + "step": 65 + }, + { + "epoch": 0.004590186737142261, + "grad_norm": 33.0, + "learning_rate": 9.999959868425867e-06, + "logits/chosen": -1.9693689346313477, + "logits/rejected": -2.1667985916137695, + "logps/chosen": -377.3485107421875, + "logps/rejected": -192.46490478515625, + "loss": 0.4211, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 1.2943639755249023, + "rewards/margins": 1.305895209312439, + "rewards/rejected": -0.011531181633472443, + "step": 66 + }, + { + "epoch": 0.004659735021038356, + "grad_norm": 30.0, + "learning_rate": 9.999958395831306e-06, + "logits/chosen": -1.6921086311340332, + "logits/rejected": -1.845644474029541, + "logps/chosen": -311.9151611328125, + "logps/rejected": -357.7615966796875, + "loss": 0.4659, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.9061014652252197, + "rewards/margins": 1.5061771869659424, + "rewards/rejected": -0.6000758409500122, + "step": 67 + }, + { + "epoch": 0.004729283304934451, + "grad_norm": 50.0, + "learning_rate": 9.999956896703658e-06, + "logits/chosen": -2.3444316387176514, + "logits/rejected": -2.4198198318481445, + "logps/chosen": -332.48724365234375, + "logps/rejected": -410.49481201171875, + "loss": 0.5545, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6607309579849243, + "rewards/margins": 0.7203662991523743, + "rewards/rejected": -0.059635356068611145, + "step": 68 + }, + { + "epoch": 0.004798831588830546, + "grad_norm": 34.75, + "learning_rate": 9.99995537104293e-06, + "logits/chosen": -1.9875808954238892, + "logits/rejected": -2.2451534271240234, + "logps/chosen": -342.03509521484375, + "logps/rejected": -474.98248291015625, + "loss": 0.4456, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": 0.9518793821334839, + "rewards/margins": 1.3824365139007568, + "rewards/rejected": -0.4305571913719177, + "step": 69 + }, + { + "epoch": 0.004868379872726641, + "grad_norm": 60.5, + "learning_rate": 9.99995381884913e-06, + "logits/chosen": -2.212440252304077, + "logits/rejected": -2.180255889892578, + "logps/chosen": -298.6424865722656, + "logps/rejected": -309.560791015625, + "loss": 0.6093, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.6185494661331177, + "rewards/margins": 0.5332748293876648, + "rewards/rejected": 0.0852745994925499, + "step": 70 + }, + { + "epoch": 0.004937928156622735, + "grad_norm": 79.0, + "learning_rate": 9.999952240122268e-06, + "logits/chosen": -1.9881188869476318, + "logits/rejected": -1.9586936235427856, + "logps/chosen": -456.666259765625, + "logps/rejected": -367.33819580078125, + "loss": 0.7016, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.6617251634597778, + "rewards/margins": 0.47897040843963623, + "rewards/rejected": 0.18275481462478638, + "step": 71 + }, + { + "epoch": 0.00500747644051883, + "grad_norm": 37.25, + "learning_rate": 9.999950634862352e-06, + "logits/chosen": -2.0069074630737305, + "logits/rejected": -1.9764200448989868, + "logps/chosen": -338.65863037109375, + "logps/rejected": -283.73736572265625, + "loss": 0.5184, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.8042025566101074, + "rewards/margins": 0.9190871119499207, + "rewards/rejected": -0.11488458514213562, + "step": 72 + }, + { + "epoch": 0.005077024724414925, + "grad_norm": 47.75, + "learning_rate": 9.999949003069391e-06, + "logits/chosen": -2.22554349899292, + "logits/rejected": -2.4910221099853516, + "logps/chosen": -365.2167663574219, + "logps/rejected": -344.9111328125, + "loss": 0.495, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": 0.9607729911804199, + "rewards/margins": 1.516304850578308, + "rewards/rejected": -0.5555318593978882, + "step": 73 + }, + { + "epoch": 0.00514657300831102, + "grad_norm": 41.75, + "learning_rate": 9.999947344743393e-06, + "logits/chosen": -1.9830665588378906, + "logits/rejected": -2.189613103866577, + "logps/chosen": -420.7419738769531, + "logps/rejected": -373.775390625, + "loss": 0.4656, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 1.203183650970459, + "rewards/margins": 1.2276890277862549, + "rewards/rejected": -0.02450525015592575, + "step": 74 + }, + { + "epoch": 0.005216121292207115, + "grad_norm": 23.75, + "learning_rate": 9.999945659884367e-06, + "logits/chosen": -1.8235785961151123, + "logits/rejected": -1.7800045013427734, + "logps/chosen": -465.8687744140625, + "logps/rejected": -269.74273681640625, + "loss": 0.3957, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.2553186416625977, + "rewards/margins": 1.6905007362365723, + "rewards/rejected": -0.43518203496932983, + "step": 75 + }, + { + "epoch": 0.00528566957610321, + "grad_norm": 39.25, + "learning_rate": 9.99994394849232e-06, + "logits/chosen": -1.758629560470581, + "logits/rejected": -1.627528429031372, + "logps/chosen": -210.3641357421875, + "logps/rejected": -215.11605834960938, + "loss": 0.5544, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.2734740078449249, + "rewards/margins": 0.6399681568145752, + "rewards/rejected": -0.36649417877197266, + "step": 76 + }, + { + "epoch": 0.005355217859999304, + "grad_norm": 44.5, + "learning_rate": 9.999942210567263e-06, + "logits/chosen": -1.978170394897461, + "logits/rejected": -1.8170394897460938, + "logps/chosen": -202.32974243164062, + "logps/rejected": -217.60516357421875, + "loss": 0.7038, + "rewards/accuracies": 0.45000001788139343, + "rewards/chosen": 0.10598383843898773, + "rewards/margins": 0.040176212787628174, + "rewards/rejected": 0.06580762565135956, + "step": 77 + }, + { + "epoch": 0.005424766143895399, + "grad_norm": 29.125, + "learning_rate": 9.999940446109207e-06, + "logits/chosen": -1.6081864833831787, + "logits/rejected": -1.48508620262146, + "logps/chosen": -246.62945556640625, + "logps/rejected": -188.04678344726562, + "loss": 0.553, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.5524815917015076, + "rewards/margins": 0.7552536725997925, + "rewards/rejected": -0.20277206599712372, + "step": 78 + }, + { + "epoch": 0.005494314427791494, + "grad_norm": 33.25, + "learning_rate": 9.99993865511816e-06, + "logits/chosen": -1.9000792503356934, + "logits/rejected": -1.7870067358016968, + "logps/chosen": -355.65625, + "logps/rejected": -166.7996063232422, + "loss": 0.4132, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.8635608553886414, + "rewards/margins": 1.325256109237671, + "rewards/rejected": -0.46169525384902954, + "step": 79 + }, + { + "epoch": 0.005563862711687589, + "grad_norm": 51.5, + "learning_rate": 9.999936837594128e-06, + "logits/chosen": -2.2187037467956543, + "logits/rejected": -2.1405515670776367, + "logps/chosen": -425.263671875, + "logps/rejected": -289.376953125, + "loss": 0.4901, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": 0.8612848520278931, + "rewards/margins": 1.3289462327957153, + "rewards/rejected": -0.4676613509654999, + "step": 80 + }, + { + "epoch": 0.005633410995583684, + "grad_norm": 34.5, + "learning_rate": 9.999934993537126e-06, + "logits/chosen": -1.908984899520874, + "logits/rejected": -2.1726903915405273, + "logps/chosen": -267.4691467285156, + "logps/rejected": -237.4594268798828, + "loss": 0.495, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.48311012983322144, + "rewards/margins": 0.934151291847229, + "rewards/rejected": -0.45104119181632996, + "step": 81 + }, + { + "epoch": 0.005702959279479779, + "grad_norm": 38.75, + "learning_rate": 9.99993312294716e-06, + "logits/chosen": -2.047267436981201, + "logits/rejected": -2.0279362201690674, + "logps/chosen": -325.7288818359375, + "logps/rejected": -330.1964111328125, + "loss": 0.4292, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.34103846549987793, + "rewards/margins": 0.9482351541519165, + "rewards/rejected": -0.6071967482566833, + "step": 82 + }, + { + "epoch": 0.005772507563375873, + "grad_norm": 53.75, + "learning_rate": 9.99993122582424e-06, + "logits/chosen": -2.3280813694000244, + "logits/rejected": -2.225856304168701, + "logps/chosen": -444.6776428222656, + "logps/rejected": -281.98095703125, + "loss": 0.5322, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5960502624511719, + "rewards/margins": 0.9329948425292969, + "rewards/rejected": -0.336944580078125, + "step": 83 + }, + { + "epoch": 0.005842055847271968, + "grad_norm": 22.375, + "learning_rate": 9.99992930216838e-06, + "logits/chosen": -2.2000014781951904, + "logits/rejected": -2.7469096183776855, + "logps/chosen": -401.75653076171875, + "logps/rejected": -363.9603576660156, + "loss": 0.3028, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.9019104242324829, + "rewards/margins": 2.32112979888916, + "rewards/rejected": -1.4192192554473877, + "step": 84 + }, + { + "epoch": 0.005911604131168063, + "grad_norm": 33.5, + "learning_rate": 9.999927351979586e-06, + "logits/chosen": -2.046794891357422, + "logits/rejected": -2.080349922180176, + "logps/chosen": -390.9368896484375, + "logps/rejected": -233.12155151367188, + "loss": 0.4374, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.873176097869873, + "rewards/margins": 1.3320425748825073, + "rewards/rejected": -0.4588664770126343, + "step": 85 + }, + { + "epoch": 0.005981152415064158, + "grad_norm": 41.75, + "learning_rate": 9.99992537525787e-06, + "logits/chosen": -1.906624674797058, + "logits/rejected": -1.8845858573913574, + "logps/chosen": -471.88616943359375, + "logps/rejected": -270.2447204589844, + "loss": 0.4052, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.7500527501106262, + "rewards/margins": 1.539765477180481, + "rewards/rejected": -0.7897127866744995, + "step": 86 + }, + { + "epoch": 0.006050700698960253, + "grad_norm": 35.0, + "learning_rate": 9.999923372003244e-06, + "logits/chosen": -2.1796021461486816, + "logits/rejected": -2.2333788871765137, + "logps/chosen": -350.7332763671875, + "logps/rejected": -300.72259521484375, + "loss": 0.4303, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.4636637270450592, + "rewards/margins": 1.1823756694793701, + "rewards/rejected": -0.7187119722366333, + "step": 87 + }, + { + "epoch": 0.006120248982856348, + "grad_norm": 45.5, + "learning_rate": 9.999921342215715e-06, + "logits/chosen": -2.035468578338623, + "logits/rejected": -2.0087974071502686, + "logps/chosen": -454.08172607421875, + "logps/rejected": -308.653564453125, + "loss": 0.4197, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.5552224516868591, + "rewards/margins": 1.4294774532318115, + "rewards/rejected": -0.8742548227310181, + "step": 88 + }, + { + "epoch": 0.0061897972667524425, + "grad_norm": 34.0, + "learning_rate": 9.999919285895296e-06, + "logits/chosen": -2.144392728805542, + "logits/rejected": -2.002211093902588, + "logps/chosen": -353.74407958984375, + "logps/rejected": -237.0970458984375, + "loss": 0.4127, + "rewards/accuracies": 0.9000000357627869, + "rewards/chosen": 0.40173354744911194, + "rewards/margins": 1.0755343437194824, + "rewards/rejected": -0.6738008856773376, + "step": 89 + }, + { + "epoch": 0.0062593455506485375, + "grad_norm": 32.25, + "learning_rate": 9.999917203041997e-06, + "logits/chosen": -1.8842511177062988, + "logits/rejected": -1.8975095748901367, + "logps/chosen": -361.48992919921875, + "logps/rejected": -335.2019348144531, + "loss": 0.3885, + "rewards/accuracies": 0.9500000476837158, + "rewards/chosen": 0.22361770272254944, + "rewards/margins": 1.4843358993530273, + "rewards/rejected": -1.2607182264328003, + "step": 90 + }, + { + "epoch": 0.0063288938345446326, + "grad_norm": 63.0, + "learning_rate": 9.999915093655832e-06, + "logits/chosen": -2.2185633182525635, + "logits/rejected": -2.292785882949829, + "logps/chosen": -424.12847900390625, + "logps/rejected": -470.9425354003906, + "loss": 0.5733, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": 0.2970457971096039, + "rewards/margins": 0.7393168807029724, + "rewards/rejected": -0.44227099418640137, + "step": 91 + }, + { + "epoch": 0.0063984421184407276, + "grad_norm": 35.75, + "learning_rate": 9.999912957736808e-06, + "logits/chosen": -2.2624685764312744, + "logits/rejected": -2.1837399005889893, + "logps/chosen": -376.39544677734375, + "logps/rejected": -318.3736572265625, + "loss": 0.5032, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": 0.29685017466545105, + "rewards/margins": 0.6483829021453857, + "rewards/rejected": -0.3515327274799347, + "step": 92 + }, + { + "epoch": 0.0064679904023368226, + "grad_norm": 37.0, + "learning_rate": 9.99991079528494e-06, + "logits/chosen": -2.2433626651763916, + "logits/rejected": -2.1510190963745117, + "logps/chosen": -380.9385986328125, + "logps/rejected": -356.2982482910156, + "loss": 0.5504, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": 0.4347189962863922, + "rewards/margins": 0.5394213795661926, + "rewards/rejected": -0.104702427983284, + "step": 93 + }, + { + "epoch": 0.006537538686232918, + "grad_norm": 37.75, + "learning_rate": 9.999908606300237e-06, + "logits/chosen": -1.908036231994629, + "logits/rejected": -1.9087241888046265, + "logps/chosen": -430.4862060546875, + "logps/rejected": -276.7899169921875, + "loss": 0.3813, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.9877005815505981, + "rewards/margins": 1.5036680698394775, + "rewards/rejected": -0.5159673690795898, + "step": 94 + }, + { + "epoch": 0.006607086970129012, + "grad_norm": 43.25, + "learning_rate": 9.99990639078271e-06, + "logits/chosen": -2.236067295074463, + "logits/rejected": -2.3056485652923584, + "logps/chosen": -353.30120849609375, + "logps/rejected": -390.1523742675781, + "loss": 0.4902, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.5005747079849243, + "rewards/margins": 1.1459251642227173, + "rewards/rejected": -0.645350456237793, + "step": 95 + }, + { + "epoch": 0.006676635254025107, + "grad_norm": 24.375, + "learning_rate": 9.999904148732373e-06, + "logits/chosen": -2.0515098571777344, + "logits/rejected": -1.787858247756958, + "logps/chosen": -316.6454162597656, + "logps/rejected": -263.90771484375, + "loss": 0.3999, + "rewards/accuracies": 0.9500000476837158, + "rewards/chosen": 0.6280633211135864, + "rewards/margins": 1.3098279237747192, + "rewards/rejected": -0.6817646026611328, + "step": 96 + }, + { + "epoch": 0.006746183537921202, + "grad_norm": 39.75, + "learning_rate": 9.999901880149237e-06, + "logits/chosen": -2.1834206581115723, + "logits/rejected": -2.3126254081726074, + "logps/chosen": -500.41912841796875, + "logps/rejected": -299.807373046875, + "loss": 0.4438, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.7941393852233887, + "rewards/margins": 1.3150429725646973, + "rewards/rejected": -0.5209035277366638, + "step": 97 + }, + { + "epoch": 0.006815731821817297, + "grad_norm": 28.5, + "learning_rate": 9.999899585033315e-06, + "logits/chosen": -2.1926941871643066, + "logits/rejected": -2.2364673614501953, + "logps/chosen": -446.4827880859375, + "logps/rejected": -358.57928466796875, + "loss": 0.4403, + "rewards/accuracies": 0.9500000476837158, + "rewards/chosen": 0.6417598724365234, + "rewards/margins": 0.9672858715057373, + "rewards/rejected": -0.32552599906921387, + "step": 98 + }, + { + "epoch": 0.006885280105713392, + "grad_norm": 43.75, + "learning_rate": 9.999897263384616e-06, + "logits/chosen": -1.919767141342163, + "logits/rejected": -1.8511722087860107, + "logps/chosen": -309.0068359375, + "logps/rejected": -279.63580322265625, + "loss": 0.4383, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.2540585696697235, + "rewards/margins": 1.2636094093322754, + "rewards/rejected": -1.0095508098602295, + "step": 99 + }, + { + "epoch": 0.006954828389609487, + "grad_norm": 45.75, + "learning_rate": 9.999894915203156e-06, + "logits/chosen": -1.9976160526275635, + "logits/rejected": -2.0882370471954346, + "logps/chosen": -303.28826904296875, + "logps/rejected": -347.4586486816406, + "loss": 0.455, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.00274793803691864, + "rewards/margins": 1.097497820854187, + "rewards/rejected": -1.1002458333969116, + "step": 100 + }, + { + "epoch": 0.007024376673505581, + "grad_norm": 40.0, + "learning_rate": 9.999892540488947e-06, + "logits/chosen": -1.921734094619751, + "logits/rejected": -1.731722116470337, + "logps/chosen": -496.9654541015625, + "logps/rejected": -365.74005126953125, + "loss": 0.3266, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 1.1187734603881836, + "rewards/margins": 2.0064687728881836, + "rewards/rejected": -0.8876953721046448, + "step": 101 + }, + { + "epoch": 0.007093924957401676, + "grad_norm": 44.0, + "learning_rate": 9.999890139242e-06, + "logits/chosen": -1.8156334161758423, + "logits/rejected": -1.9902055263519287, + "logps/chosen": -354.43511962890625, + "logps/rejected": -204.51922607421875, + "loss": 0.474, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.3571126461029053, + "rewards/margins": 1.1740868091583252, + "rewards/rejected": -0.8169741630554199, + "step": 102 + }, + { + "epoch": 0.007163473241297771, + "grad_norm": 33.75, + "learning_rate": 9.999887711462329e-06, + "logits/chosen": -2.2824862003326416, + "logits/rejected": -2.4707837104797363, + "logps/chosen": -501.87933349609375, + "logps/rejected": -294.4993591308594, + "loss": 0.4554, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.6692067980766296, + "rewards/margins": 1.196839451789856, + "rewards/rejected": -0.5276325941085815, + "step": 103 + }, + { + "epoch": 0.007233021525193866, + "grad_norm": 35.75, + "learning_rate": 9.999885257149944e-06, + "logits/chosen": -1.8012323379516602, + "logits/rejected": -2.2179512977600098, + "logps/chosen": -274.47711181640625, + "logps/rejected": -254.68896484375, + "loss": 0.449, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3754795491695404, + "rewards/margins": 1.5341426134109497, + "rewards/rejected": -1.158663034439087, + "step": 104 + }, + { + "epoch": 0.007302569809089961, + "grad_norm": 37.75, + "learning_rate": 9.99988277630486e-06, + "logits/chosen": -2.5778496265411377, + "logits/rejected": -2.430804967880249, + "logps/chosen": -426.83392333984375, + "logps/rejected": -333.815673828125, + "loss": 0.4246, + "rewards/accuracies": 0.9500000476837158, + "rewards/chosen": 0.5159269571304321, + "rewards/margins": 0.7792252898216248, + "rewards/rejected": -0.2632983326911926, + "step": 105 + }, + { + "epoch": 0.007372118092986056, + "grad_norm": 40.75, + "learning_rate": 9.999880268927093e-06, + "logits/chosen": -2.3947038650512695, + "logits/rejected": -2.020326852798462, + "logps/chosen": -403.68798828125, + "logps/rejected": -304.28594970703125, + "loss": 0.5859, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.4004051387310028, + "rewards/margins": 0.6247851848602295, + "rewards/rejected": -0.2243800014257431, + "step": 106 + }, + { + "epoch": 0.00744166637688215, + "grad_norm": 29.5, + "learning_rate": 9.999877735016653e-06, + "logits/chosen": -2.052888870239258, + "logits/rejected": -2.0310537815093994, + "logps/chosen": -387.9144592285156, + "logps/rejected": -302.40631103515625, + "loss": 0.4671, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3618411421775818, + "rewards/margins": 1.1706888675689697, + "rewards/rejected": -0.8088477849960327, + "step": 107 + }, + { + "epoch": 0.007511214660778245, + "grad_norm": 28.75, + "learning_rate": 9.999875174573554e-06, + "logits/chosen": -1.842193603515625, + "logits/rejected": -2.067054271697998, + "logps/chosen": -271.19781494140625, + "logps/rejected": -300.96734619140625, + "loss": 0.4087, + "rewards/accuracies": 0.9000000357627869, + "rewards/chosen": 0.39628249406814575, + "rewards/margins": 1.4589645862579346, + "rewards/rejected": -1.0626819133758545, + "step": 108 + }, + { + "epoch": 0.00758076294467434, + "grad_norm": 41.25, + "learning_rate": 9.99987258759781e-06, + "logits/chosen": -1.9016921520233154, + "logits/rejected": -1.9969903230667114, + "logps/chosen": -286.58477783203125, + "logps/rejected": -212.75387573242188, + "loss": 0.5315, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.27487462759017944, + "rewards/margins": 0.7188858985900879, + "rewards/rejected": -0.44401127099990845, + "step": 109 + }, + { + "epoch": 0.007650311228570435, + "grad_norm": 29.875, + "learning_rate": 9.999869974089433e-06, + "logits/chosen": -1.8320720195770264, + "logits/rejected": -1.7135732173919678, + "logps/chosen": -360.64227294921875, + "logps/rejected": -280.5082702636719, + "loss": 0.3431, + "rewards/accuracies": 0.9500000476837158, + "rewards/chosen": 0.8838325142860413, + "rewards/margins": 1.7671468257904053, + "rewards/rejected": -0.8833141922950745, + "step": 110 + }, + { + "epoch": 0.00771985951246653, + "grad_norm": 39.0, + "learning_rate": 9.999867334048441e-06, + "logits/chosen": -1.9485034942626953, + "logits/rejected": -2.0538601875305176, + "logps/chosen": -342.5113830566406, + "logps/rejected": -260.21612548828125, + "loss": 0.4727, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": 0.8435735106468201, + "rewards/margins": 1.017899513244629, + "rewards/rejected": -0.17432589828968048, + "step": 111 + }, + { + "epoch": 0.007789407796362625, + "grad_norm": 54.25, + "learning_rate": 9.999864667474843e-06, + "logits/chosen": -2.595912456512451, + "logits/rejected": -2.5925071239471436, + "logps/chosen": -515.8512573242188, + "logps/rejected": -295.3178405761719, + "loss": 0.5466, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.5208666324615479, + "rewards/margins": 0.8142785429954529, + "rewards/rejected": -0.29341191053390503, + "step": 112 + }, + { + "epoch": 0.00785895608025872, + "grad_norm": 32.25, + "learning_rate": 9.999861974368656e-06, + "logits/chosen": -1.6570208072662354, + "logits/rejected": -1.9999960660934448, + "logps/chosen": -246.3365020751953, + "logps/rejected": -419.89129638671875, + "loss": 0.3326, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6707344651222229, + "rewards/margins": 2.1201837062835693, + "rewards/rejected": -1.4494491815567017, + "step": 113 + }, + { + "epoch": 0.007928504364154815, + "grad_norm": 42.0, + "learning_rate": 9.999859254729895e-06, + "logits/chosen": -2.4135072231292725, + "logits/rejected": -2.455704689025879, + "logps/chosen": -396.7958068847656, + "logps/rejected": -275.8202209472656, + "loss": 0.4844, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.5708680152893066, + "rewards/margins": 1.4742722511291504, + "rewards/rejected": -0.903404176235199, + "step": 114 + }, + { + "epoch": 0.00799805264805091, + "grad_norm": 28.875, + "learning_rate": 9.999856508558572e-06, + "logits/chosen": -1.7310173511505127, + "logits/rejected": -1.7376844882965088, + "logps/chosen": -353.1683349609375, + "logps/rejected": -306.3093566894531, + "loss": 0.4654, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.4436909258365631, + "rewards/margins": 1.1000614166259766, + "rewards/rejected": -0.6563705801963806, + "step": 115 + }, + { + "epoch": 0.008067600931947003, + "grad_norm": 47.5, + "learning_rate": 9.999853735854703e-06, + "logits/chosen": -1.7987043857574463, + "logits/rejected": -1.692586898803711, + "logps/chosen": -324.2035217285156, + "logps/rejected": -323.4584045410156, + "loss": 0.5876, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.2406245470046997, + "rewards/margins": 0.7598384618759155, + "rewards/rejected": -0.5192137956619263, + "step": 116 + }, + { + "epoch": 0.008137149215843098, + "grad_norm": 36.5, + "learning_rate": 9.999850936618303e-06, + "logits/chosen": -1.8518128395080566, + "logits/rejected": -1.8932234048843384, + "logps/chosen": -416.56915283203125, + "logps/rejected": -277.7508544921875, + "loss": 0.4218, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.9552115201950073, + "rewards/margins": 1.4410091638565063, + "rewards/rejected": -0.485797643661499, + "step": 117 + }, + { + "epoch": 0.008206697499739193, + "grad_norm": 44.75, + "learning_rate": 9.999848110849387e-06, + "logits/chosen": -1.8901162147521973, + "logits/rejected": -1.7480672597885132, + "logps/chosen": -290.180419921875, + "logps/rejected": -295.3495788574219, + "loss": 0.5852, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.20938530564308167, + "rewards/margins": 0.43623602390289307, + "rewards/rejected": -0.6456212997436523, + "step": 118 + }, + { + "epoch": 0.008276245783635288, + "grad_norm": 47.5, + "learning_rate": 9.99984525854797e-06, + "logits/chosen": -1.94680655002594, + "logits/rejected": -1.922938585281372, + "logps/chosen": -403.6405029296875, + "logps/rejected": -377.47088623046875, + "loss": 0.4013, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.6339670419692993, + "rewards/margins": 1.3346670866012573, + "rewards/rejected": -0.700700044631958, + "step": 119 + }, + { + "epoch": 0.008345794067531383, + "grad_norm": 36.0, + "learning_rate": 9.999842379714064e-06, + "logits/chosen": -1.6067575216293335, + "logits/rejected": -1.7978880405426025, + "logps/chosen": -299.720703125, + "logps/rejected": -282.0791015625, + "loss": 0.4496, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.2824891209602356, + "rewards/margins": 1.258902668952942, + "rewards/rejected": -0.9764136075973511, + "step": 120 + }, + { + "epoch": 0.008415342351427478, + "grad_norm": 51.75, + "learning_rate": 9.999839474347689e-06, + "logits/chosen": -1.7490146160125732, + "logits/rejected": -2.0324668884277344, + "logps/chosen": -316.0492858886719, + "logps/rejected": -321.60736083984375, + "loss": 0.4839, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16192859411239624, + "rewards/margins": 0.9611227512359619, + "rewards/rejected": -0.7991940975189209, + "step": 121 + }, + { + "epoch": 0.008484890635323573, + "grad_norm": 33.0, + "learning_rate": 9.999836542448855e-06, + "logits/chosen": -1.6834568977355957, + "logits/rejected": -1.7607324123382568, + "logps/chosen": -391.8604431152344, + "logps/rejected": -392.81640625, + "loss": 0.3048, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.7181395888328552, + "rewards/margins": 2.4574594497680664, + "rewards/rejected": -1.739319920539856, + "step": 122 + }, + { + "epoch": 0.008554438919219668, + "grad_norm": 50.25, + "learning_rate": 9.999833584017585e-06, + "logits/chosen": -2.165437698364258, + "logits/rejected": -2.3240346908569336, + "logps/chosen": -387.0223388671875, + "logps/rejected": -408.45538330078125, + "loss": 0.5444, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.28136467933654785, + "rewards/margins": 1.216580867767334, + "rewards/rejected": -0.9352161884307861, + "step": 123 + }, + { + "epoch": 0.008623987203115763, + "grad_norm": 44.25, + "learning_rate": 9.999830599053888e-06, + "logits/chosen": -2.068748950958252, + "logits/rejected": -2.469587564468384, + "logps/chosen": -275.2888488769531, + "logps/rejected": -353.27325439453125, + "loss": 0.5777, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.12386937439441681, + "rewards/margins": 0.9148590564727783, + "rewards/rejected": -1.0387284755706787, + "step": 124 + }, + { + "epoch": 0.008693535487011858, + "grad_norm": 30.25, + "learning_rate": 9.999827587557783e-06, + "logits/chosen": -1.8905799388885498, + "logits/rejected": -1.5225300788879395, + "logps/chosen": -392.71258544921875, + "logps/rejected": -236.7716064453125, + "loss": 0.4374, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.11848696321249008, + "rewards/margins": 1.017806053161621, + "rewards/rejected": -0.8993192315101624, + "step": 125 + }, + { + "epoch": 0.008763083770907953, + "grad_norm": 29.375, + "learning_rate": 9.999824549529285e-06, + "logits/chosen": -2.0472726821899414, + "logits/rejected": -2.493684768676758, + "logps/chosen": -332.4390869140625, + "logps/rejected": -407.05029296875, + "loss": 0.389, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0741736888885498, + "rewards/margins": 1.6070003509521484, + "rewards/rejected": -1.5328266620635986, + "step": 126 + }, + { + "epoch": 0.008832632054804048, + "grad_norm": 20.5, + "learning_rate": 9.99982148496841e-06, + "logits/chosen": -1.9628440141677856, + "logits/rejected": -1.911217212677002, + "logps/chosen": -381.03216552734375, + "logps/rejected": -199.31561279296875, + "loss": 0.4224, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.4860251545906067, + "rewards/margins": 1.6638890504837036, + "rewards/rejected": -1.1778638362884521, + "step": 127 + }, + { + "epoch": 0.008902180338700142, + "grad_norm": 40.5, + "learning_rate": 9.999818393875175e-06, + "logits/chosen": -2.021815061569214, + "logits/rejected": -2.246922492980957, + "logps/chosen": -390.8255920410156, + "logps/rejected": -421.0048828125, + "loss": 0.4048, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.21156665682792664, + "rewards/margins": 1.8172640800476074, + "rewards/rejected": -1.6056973934173584, + "step": 128 + }, + { + "epoch": 0.008971728622596237, + "grad_norm": 45.0, + "learning_rate": 9.999815276249596e-06, + "logits/chosen": -2.133558750152588, + "logits/rejected": -2.4764270782470703, + "logps/chosen": -338.45654296875, + "logps/rejected": -478.4189453125, + "loss": 0.4588, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.1149824932217598, + "rewards/margins": 1.4291207790374756, + "rewards/rejected": -1.5441033840179443, + "step": 129 + }, + { + "epoch": 0.009041276906492332, + "grad_norm": 47.5, + "learning_rate": 9.999812132091691e-06, + "logits/chosen": -1.995426893234253, + "logits/rejected": -2.177255868911743, + "logps/chosen": -336.8091735839844, + "logps/rejected": -370.2332458496094, + "loss": 0.6003, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.1795596182346344, + "rewards/margins": 0.8996149301528931, + "rewards/rejected": -0.720055341720581, + "step": 130 + }, + { + "epoch": 0.009110825190388427, + "grad_norm": 41.0, + "learning_rate": 9.999808961401474e-06, + "logits/chosen": -2.3544397354125977, + "logits/rejected": -2.235128879547119, + "logps/chosen": -312.712158203125, + "logps/rejected": -298.5188293457031, + "loss": 0.4085, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.07409422099590302, + "rewards/margins": 1.5104870796203613, + "rewards/rejected": -1.5845813751220703, + "step": 131 + }, + { + "epoch": 0.009180373474284522, + "grad_norm": 46.25, + "learning_rate": 9.999805764178963e-06, + "logits/chosen": -1.8352596759796143, + "logits/rejected": -1.8144234418869019, + "logps/chosen": -303.17694091796875, + "logps/rejected": -357.70513916015625, + "loss": 0.5284, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5404098629951477, + "rewards/margins": 0.9908088445663452, + "rewards/rejected": -1.5312186479568481, + "step": 132 + }, + { + "epoch": 0.009249921758180617, + "grad_norm": 50.25, + "learning_rate": 9.999802540424175e-06, + "logits/chosen": -1.7251002788543701, + "logits/rejected": -1.7056000232696533, + "logps/chosen": -359.6927795410156, + "logps/rejected": -330.77606201171875, + "loss": 0.5256, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.3259652256965637, + "rewards/margins": 1.0743813514709473, + "rewards/rejected": -1.4003466367721558, + "step": 133 + }, + { + "epoch": 0.009319470042076712, + "grad_norm": 41.25, + "learning_rate": 9.999799290137127e-06, + "logits/chosen": -2.1902990341186523, + "logits/rejected": -2.2894632816314697, + "logps/chosen": -396.1300048828125, + "logps/rejected": -272.83782958984375, + "loss": 0.417, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.33982616662979126, + "rewards/margins": 1.3054828643798828, + "rewards/rejected": -1.6453092098236084, + "step": 134 + }, + { + "epoch": 0.009389018325972807, + "grad_norm": 38.25, + "learning_rate": 9.999796013317837e-06, + "logits/chosen": -2.3197972774505615, + "logits/rejected": -2.4950315952301025, + "logps/chosen": -275.70928955078125, + "logps/rejected": -324.5227355957031, + "loss": 0.4875, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": -0.46912163496017456, + "rewards/margins": 1.2943669557571411, + "rewards/rejected": -1.763488531112671, + "step": 135 + }, + { + "epoch": 0.009458566609868902, + "grad_norm": 38.5, + "learning_rate": 9.999792709966323e-06, + "logits/chosen": -1.784104585647583, + "logits/rejected": -2.028597354888916, + "logps/chosen": -286.1109619140625, + "logps/rejected": -255.00253295898438, + "loss": 0.5982, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": -0.622270405292511, + "rewards/margins": 0.5646243095397949, + "rewards/rejected": -1.1868946552276611, + "step": 136 + }, + { + "epoch": 0.009528114893764997, + "grad_norm": 36.0, + "learning_rate": 9.999789380082601e-06, + "logits/chosen": -1.9110618829727173, + "logits/rejected": -1.7683395147323608, + "logps/chosen": -500.4515380859375, + "logps/rejected": -285.3958740234375, + "loss": 0.4236, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.279616117477417, + "rewards/margins": 1.7909634113311768, + "rewards/rejected": -1.5113472938537598, + "step": 137 + }, + { + "epoch": 0.009597663177661092, + "grad_norm": 58.75, + "learning_rate": 9.999786023666687e-06, + "logits/chosen": -1.8615355491638184, + "logits/rejected": -1.8850352764129639, + "logps/chosen": -423.7852783203125, + "logps/rejected": -305.959716796875, + "loss": 0.5586, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.24174700677394867, + "rewards/margins": 1.3934049606323242, + "rewards/rejected": -1.6351518630981445, + "step": 138 + }, + { + "epoch": 0.009667211461557187, + "grad_norm": 43.5, + "learning_rate": 9.999782640718603e-06, + "logits/chosen": -2.266606092453003, + "logits/rejected": -2.5995078086853027, + "logps/chosen": -537.2833251953125, + "logps/rejected": -332.2987060546875, + "loss": 0.3882, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1133798211812973, + "rewards/margins": 1.3790431022644043, + "rewards/rejected": -1.2656632661819458, + "step": 139 + }, + { + "epoch": 0.009736759745453282, + "grad_norm": 34.25, + "learning_rate": 9.999779231238363e-06, + "logits/chosen": -1.8262062072753906, + "logits/rejected": -1.951011300086975, + "logps/chosen": -233.76266479492188, + "logps/rejected": -215.7196044921875, + "loss": 0.4906, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5166194438934326, + "rewards/margins": 0.7855195999145508, + "rewards/rejected": -1.3021390438079834, + "step": 140 + }, + { + "epoch": 0.009806308029349375, + "grad_norm": 42.0, + "learning_rate": 9.999775795225988e-06, + "logits/chosen": -2.148371696472168, + "logits/rejected": -2.141429901123047, + "logps/chosen": -426.51422119140625, + "logps/rejected": -354.72186279296875, + "loss": 0.5116, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": 0.559288740158081, + "rewards/margins": 1.354931116104126, + "rewards/rejected": -0.7956422567367554, + "step": 141 + }, + { + "epoch": 0.00987585631324547, + "grad_norm": 33.25, + "learning_rate": 9.999772332681496e-06, + "logits/chosen": -2.2950315475463867, + "logits/rejected": -2.278259038925171, + "logps/chosen": -299.96356201171875, + "logps/rejected": -244.24217224121094, + "loss": 0.4507, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.4661591947078705, + "rewards/margins": 1.5921554565429688, + "rewards/rejected": -2.058314800262451, + "step": 142 + }, + { + "epoch": 0.009945404597141565, + "grad_norm": 39.0, + "learning_rate": 9.999768843604903e-06, + "logits/chosen": -2.099987030029297, + "logits/rejected": -1.9674077033996582, + "logps/chosen": -446.43231201171875, + "logps/rejected": -289.2764892578125, + "loss": 0.437, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.15205225348472595, + "rewards/margins": 1.5049569606781006, + "rewards/rejected": -1.3529045581817627, + "step": 143 + }, + { + "epoch": 0.01001495288103766, + "grad_norm": 48.25, + "learning_rate": 9.99976532799623e-06, + "logits/chosen": -1.855684518814087, + "logits/rejected": -2.1336143016815186, + "logps/chosen": -219.06790161132812, + "logps/rejected": -285.15740966796875, + "loss": 0.5217, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": -0.3650386929512024, + "rewards/margins": 1.0989367961883545, + "rewards/rejected": -1.463975429534912, + "step": 144 + }, + { + "epoch": 0.010084501164933755, + "grad_norm": 34.0, + "learning_rate": 9.999761785855495e-06, + "logits/chosen": -1.987697720527649, + "logits/rejected": -2.13950777053833, + "logps/chosen": -321.8941345214844, + "logps/rejected": -386.0785827636719, + "loss": 0.4052, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.10000848770141602, + "rewards/margins": 1.9168283939361572, + "rewards/rejected": -1.8168197870254517, + "step": 145 + }, + { + "epoch": 0.01015404944882985, + "grad_norm": 48.75, + "learning_rate": 9.999758217182716e-06, + "logits/chosen": -2.0804598331451416, + "logits/rejected": -2.3119006156921387, + "logps/chosen": -413.2589111328125, + "logps/rejected": -481.65850830078125, + "loss": 0.4492, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": 0.6128464937210083, + "rewards/margins": 1.834987759590149, + "rewards/rejected": -1.2221413850784302, + "step": 146 + }, + { + "epoch": 0.010223597732725945, + "grad_norm": 48.75, + "learning_rate": 9.999754621977912e-06, + "logits/chosen": -2.2161660194396973, + "logits/rejected": -2.0997447967529297, + "logps/chosen": -341.5295104980469, + "logps/rejected": -431.5401306152344, + "loss": 0.403, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.16010130941867828, + "rewards/margins": 1.3238003253936768, + "rewards/rejected": -1.1636990308761597, + "step": 147 + }, + { + "epoch": 0.01029314601662204, + "grad_norm": 21.25, + "learning_rate": 9.999751000241103e-06, + "logits/chosen": -1.6595348119735718, + "logits/rejected": -1.576436996459961, + "logps/chosen": -279.8513488769531, + "logps/rejected": -268.538330078125, + "loss": 0.3152, + "rewards/accuracies": 0.9500000476837158, + "rewards/chosen": 0.5522175431251526, + "rewards/margins": 1.816031575202942, + "rewards/rejected": -1.2638139724731445, + "step": 148 + }, + { + "epoch": 0.010362694300518135, + "grad_norm": 44.25, + "learning_rate": 9.999747351972307e-06, + "logits/chosen": -1.9582464694976807, + "logits/rejected": -1.925800085067749, + "logps/chosen": -370.12579345703125, + "logps/rejected": -353.4024658203125, + "loss": 0.5107, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": 0.7650132179260254, + "rewards/margins": 1.3018192052841187, + "rewards/rejected": -0.5368059277534485, + "step": 149 + }, + { + "epoch": 0.01043224258441423, + "grad_norm": 45.5, + "learning_rate": 9.999743677171545e-06, + "logits/chosen": -2.308833122253418, + "logits/rejected": -2.3068995475769043, + "logps/chosen": -289.95263671875, + "logps/rejected": -321.85980224609375, + "loss": 0.5182, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.1666913479566574, + "rewards/margins": 0.8715847730636597, + "rewards/rejected": -0.7048934102058411, + "step": 150 + }, + { + "epoch": 0.010501790868310325, + "grad_norm": 55.25, + "learning_rate": 9.999739975838835e-06, + "logits/chosen": -1.72349214553833, + "logits/rejected": -2.028937816619873, + "logps/chosen": -317.18927001953125, + "logps/rejected": -428.93682861328125, + "loss": 0.5717, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.20294076204299927, + "rewards/margins": 0.9504521489143372, + "rewards/rejected": -0.7475113868713379, + "step": 151 + }, + { + "epoch": 0.01057133915220642, + "grad_norm": 35.0, + "learning_rate": 9.999736247974196e-06, + "logits/chosen": -2.104125738143921, + "logits/rejected": -1.7851452827453613, + "logps/chosen": -467.6999816894531, + "logps/rejected": -255.7369842529297, + "loss": 0.3315, + "rewards/accuracies": 0.9000000357627869, + "rewards/chosen": 1.248273253440857, + "rewards/margins": 2.134517192840576, + "rewards/rejected": -0.8862440586090088, + "step": 152 + }, + { + "epoch": 0.010640887436102513, + "grad_norm": 27.375, + "learning_rate": 9.999732493577651e-06, + "logits/chosen": -2.17039155960083, + "logits/rejected": -2.349635124206543, + "logps/chosen": -374.0396728515625, + "logps/rejected": -283.3423767089844, + "loss": 0.2986, + "rewards/accuracies": 0.9000000357627869, + "rewards/chosen": 1.0817344188690186, + "rewards/margins": 2.157132625579834, + "rewards/rejected": -1.0753982067108154, + "step": 153 + }, + { + "epoch": 0.010710435719998608, + "grad_norm": 40.75, + "learning_rate": 9.999728712649217e-06, + "logits/chosen": -1.9236202239990234, + "logits/rejected": -1.823804259300232, + "logps/chosen": -395.7584228515625, + "logps/rejected": -284.74127197265625, + "loss": 0.5162, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3455967307090759, + "rewards/margins": 0.9116082191467285, + "rewards/rejected": -0.5660114288330078, + "step": 154 + }, + { + "epoch": 0.010779984003894703, + "grad_norm": 34.0, + "learning_rate": 9.999724905188917e-06, + "logits/chosen": -2.1912121772766113, + "logits/rejected": -2.2852718830108643, + "logps/chosen": -404.85797119140625, + "logps/rejected": -315.2767333984375, + "loss": 0.3611, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.5890249013900757, + "rewards/margins": 1.857161045074463, + "rewards/rejected": -1.2681360244750977, + "step": 155 + }, + { + "epoch": 0.010849532287790798, + "grad_norm": 43.75, + "learning_rate": 9.999721071196766e-06, + "logits/chosen": -1.7282344102859497, + "logits/rejected": -1.6572694778442383, + "logps/chosen": -383.9149169921875, + "logps/rejected": -323.91650390625, + "loss": 0.4491, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.2261703908443451, + "rewards/margins": 1.3806204795837402, + "rewards/rejected": -1.1544500589370728, + "step": 156 + }, + { + "epoch": 0.010919080571686893, + "grad_norm": 30.5, + "learning_rate": 9.99971721067279e-06, + "logits/chosen": -1.695785641670227, + "logits/rejected": -1.5937681198120117, + "logps/chosen": -357.56768798828125, + "logps/rejected": -210.7926483154297, + "loss": 0.4041, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.531658947467804, + "rewards/margins": 1.7179948091506958, + "rewards/rejected": -1.186335802078247, + "step": 157 + }, + { + "epoch": 0.010988628855582988, + "grad_norm": 57.75, + "learning_rate": 9.999713323617007e-06, + "logits/chosen": -2.111879348754883, + "logits/rejected": -2.273958921432495, + "logps/chosen": -402.31005859375, + "logps/rejected": -441.7332763671875, + "loss": 0.492, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.5210748910903931, + "rewards/margins": 1.718184232711792, + "rewards/rejected": -1.1971094608306885, + "step": 158 + }, + { + "epoch": 0.011058177139479083, + "grad_norm": 38.5, + "learning_rate": 9.999709410029435e-06, + "logits/chosen": -1.9984445571899414, + "logits/rejected": -1.739133596420288, + "logps/chosen": -389.82293701171875, + "logps/rejected": -271.44366455078125, + "loss": 0.398, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.054737091064453125, + "rewards/margins": 1.5282342433929443, + "rewards/rejected": -1.5829713344573975, + "step": 159 + }, + { + "epoch": 0.011127725423375178, + "grad_norm": 52.25, + "learning_rate": 9.9997054699101e-06, + "logits/chosen": -2.395916700363159, + "logits/rejected": -2.453493595123291, + "logps/chosen": -418.62213134765625, + "logps/rejected": -328.48834228515625, + "loss": 0.4527, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.10209833085536957, + "rewards/margins": 1.040419340133667, + "rewards/rejected": -0.9383209347724915, + "step": 160 + }, + { + "epoch": 0.011197273707271273, + "grad_norm": 30.25, + "learning_rate": 9.99970150325902e-06, + "logits/chosen": -1.9088315963745117, + "logits/rejected": -2.1194539070129395, + "logps/chosen": -408.445556640625, + "logps/rejected": -388.78216552734375, + "loss": 0.35, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 1.116747498512268, + "rewards/margins": 2.258065938949585, + "rewards/rejected": -1.141318440437317, + "step": 161 + }, + { + "epoch": 0.011266821991167368, + "grad_norm": 40.0, + "learning_rate": 9.999697510076216e-06, + "logits/chosen": -1.9484000205993652, + "logits/rejected": -1.7765334844589233, + "logps/chosen": -420.71002197265625, + "logps/rejected": -354.52203369140625, + "loss": 0.4004, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 1.1482105255126953, + "rewards/margins": 1.6778624057769775, + "rewards/rejected": -0.5296517610549927, + "step": 162 + }, + { + "epoch": 0.011336370275063463, + "grad_norm": 45.25, + "learning_rate": 9.999693490361708e-06, + "logits/chosen": -2.013686418533325, + "logits/rejected": -2.2918448448181152, + "logps/chosen": -376.7837829589844, + "logps/rejected": -354.53338623046875, + "loss": 0.4813, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.13578952848911285, + "rewards/margins": 1.7332830429077148, + "rewards/rejected": -1.5974936485290527, + "step": 163 + }, + { + "epoch": 0.011405918558959558, + "grad_norm": 41.5, + "learning_rate": 9.99968944411552e-06, + "logits/chosen": -2.0083389282226562, + "logits/rejected": -1.8593366146087646, + "logps/chosen": -353.74560546875, + "logps/rejected": -332.3958435058594, + "loss": 0.4199, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": 0.30136838555336, + "rewards/margins": 1.7655296325683594, + "rewards/rejected": -1.4641612768173218, + "step": 164 + }, + { + "epoch": 0.011475466842855652, + "grad_norm": 31.75, + "learning_rate": 9.999685371337674e-06, + "logits/chosen": -2.117825508117676, + "logits/rejected": -2.178496837615967, + "logps/chosen": -367.3671875, + "logps/rejected": -239.72097778320312, + "loss": 0.4308, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.22402076423168182, + "rewards/margins": 1.2471394538879395, + "rewards/rejected": -1.4711604118347168, + "step": 165 + }, + { + "epoch": 0.011545015126751747, + "grad_norm": 54.5, + "learning_rate": 9.999681272028188e-06, + "logits/chosen": -2.0813117027282715, + "logits/rejected": -2.19061017036438, + "logps/chosen": -395.650390625, + "logps/rejected": -369.11492919921875, + "loss": 0.592, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.47785645723342896, + "rewards/margins": 0.6328763961791992, + "rewards/rejected": -1.1107327938079834, + "step": 166 + }, + { + "epoch": 0.011614563410647842, + "grad_norm": 39.75, + "learning_rate": 9.999677146187088e-06, + "logits/chosen": -1.9994887113571167, + "logits/rejected": -1.9863563776016235, + "logps/chosen": -473.6923828125, + "logps/rejected": -282.004638671875, + "loss": 0.3599, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.6384748220443726, + "rewards/margins": 2.1072275638580322, + "rewards/rejected": -1.4687527418136597, + "step": 167 + }, + { + "epoch": 0.011684111694543937, + "grad_norm": 39.0, + "learning_rate": 9.99967299381439e-06, + "logits/chosen": -2.1873903274536133, + "logits/rejected": -2.282121181488037, + "logps/chosen": -365.04852294921875, + "logps/rejected": -326.897216796875, + "loss": 0.4081, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.03320002555847168, + "rewards/margins": 1.6273894309997559, + "rewards/rejected": -1.5941894054412842, + "step": 168 + }, + { + "epoch": 0.011753659978440032, + "grad_norm": 30.875, + "learning_rate": 9.999668814910123e-06, + "logits/chosen": -1.9716796875, + "logits/rejected": -2.298739194869995, + "logps/chosen": -328.75836181640625, + "logps/rejected": -310.42877197265625, + "loss": 0.5331, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.19528049230575562, + "rewards/margins": 1.1744117736816406, + "rewards/rejected": -1.369692087173462, + "step": 169 + }, + { + "epoch": 0.011823208262336127, + "grad_norm": 43.5, + "learning_rate": 9.999664609474304e-06, + "logits/chosen": -2.1405038833618164, + "logits/rejected": -1.9507406949996948, + "logps/chosen": -381.611328125, + "logps/rejected": -358.01422119140625, + "loss": 0.3973, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.3738439679145813, + "rewards/margins": 1.8931117057800293, + "rewards/rejected": -1.5192677974700928, + "step": 170 + }, + { + "epoch": 0.011892756546232222, + "grad_norm": 58.5, + "learning_rate": 9.99966037750696e-06, + "logits/chosen": -1.9784520864486694, + "logits/rejected": -2.233271598815918, + "logps/chosen": -342.8480224609375, + "logps/rejected": -370.0944519042969, + "loss": 0.5207, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.47748875617980957, + "rewards/margins": 1.626946210861206, + "rewards/rejected": -1.1494574546813965, + "step": 171 + }, + { + "epoch": 0.011962304830128317, + "grad_norm": 36.0, + "learning_rate": 9.999656119008107e-06, + "logits/chosen": -1.7746723890304565, + "logits/rejected": -1.948885202407837, + "logps/chosen": -307.2950134277344, + "logps/rejected": -279.87689208984375, + "loss": 0.471, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.07075725495815277, + "rewards/margins": 1.5493837594985962, + "rewards/rejected": -1.6201410293579102, + "step": 172 + }, + { + "epoch": 0.012031853114024412, + "grad_norm": 58.75, + "learning_rate": 9.999651833977772e-06, + "logits/chosen": -2.2835683822631836, + "logits/rejected": -2.2702455520629883, + "logps/chosen": -402.5487060546875, + "logps/rejected": -331.105224609375, + "loss": 0.5329, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.06890052556991577, + "rewards/margins": 1.2599706649780273, + "rewards/rejected": -1.328871250152588, + "step": 173 + }, + { + "epoch": 0.012101401397920507, + "grad_norm": 24.875, + "learning_rate": 9.999647522415978e-06, + "logits/chosen": -2.098814010620117, + "logits/rejected": -2.2233738899230957, + "logps/chosen": -369.877197265625, + "logps/rejected": -460.09039306640625, + "loss": 0.3158, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.14701759815216064, + "rewards/margins": 2.0494160652160645, + "rewards/rejected": -1.9023984670639038, + "step": 174 + }, + { + "epoch": 0.012170949681816602, + "grad_norm": 34.5, + "learning_rate": 9.999643184322747e-06, + "logits/chosen": -2.0852320194244385, + "logits/rejected": -1.7597146034240723, + "logps/chosen": -388.249755859375, + "logps/rejected": -271.8308410644531, + "loss": 0.4311, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.0015751421451568604, + "rewards/margins": 1.5796971321105957, + "rewards/rejected": -1.5781221389770508, + "step": 175 + }, + { + "epoch": 0.012240497965712697, + "grad_norm": 38.75, + "learning_rate": 9.9996388196981e-06, + "logits/chosen": -2.1653785705566406, + "logits/rejected": -2.281691551208496, + "logps/chosen": -364.22119140625, + "logps/rejected": -373.0096435546875, + "loss": 0.4531, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": -0.06015029549598694, + "rewards/margins": 1.4423726797103882, + "rewards/rejected": -1.5025229454040527, + "step": 176 + }, + { + "epoch": 0.01231004624960879, + "grad_norm": 55.25, + "learning_rate": 9.999634428542064e-06, + "logits/chosen": -1.9460824728012085, + "logits/rejected": -2.1573314666748047, + "logps/chosen": -370.415771484375, + "logps/rejected": -410.0446472167969, + "loss": 0.5764, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.7644985914230347, + "rewards/margins": 0.5029374361038208, + "rewards/rejected": -1.267435908317566, + "step": 177 + }, + { + "epoch": 0.012379594533504885, + "grad_norm": 43.5, + "learning_rate": 9.99963001085466e-06, + "logits/chosen": -1.9454140663146973, + "logits/rejected": -2.087858200073242, + "logps/chosen": -332.80810546875, + "logps/rejected": -374.108154296875, + "loss": 0.4115, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.29328009486198425, + "rewards/margins": 1.9443135261535645, + "rewards/rejected": -2.237593650817871, + "step": 178 + }, + { + "epoch": 0.01244914281740098, + "grad_norm": 35.25, + "learning_rate": 9.99962556663591e-06, + "logits/chosen": -2.324028491973877, + "logits/rejected": -2.3427979946136475, + "logps/chosen": -385.565673828125, + "logps/rejected": -367.2955322265625, + "loss": 0.4646, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.27442866563796997, + "rewards/margins": 1.3950212001800537, + "rewards/rejected": -1.6694495677947998, + "step": 179 + }, + { + "epoch": 0.012518691101297075, + "grad_norm": 34.25, + "learning_rate": 9.999621095885843e-06, + "logits/chosen": -2.012193202972412, + "logits/rejected": -1.9504305124282837, + "logps/chosen": -340.6451110839844, + "logps/rejected": -254.3839874267578, + "loss": 0.4334, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.7405756115913391, + "rewards/margins": 0.9877851009368896, + "rewards/rejected": -1.728360652923584, + "step": 180 + }, + { + "epoch": 0.01258823938519317, + "grad_norm": 31.375, + "learning_rate": 9.999616598604476e-06, + "logits/chosen": -1.824182391166687, + "logits/rejected": -1.8830469846725464, + "logps/chosen": -357.8989562988281, + "logps/rejected": -270.982177734375, + "loss": 0.4684, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": -0.12026992440223694, + "rewards/margins": 1.9750096797943115, + "rewards/rejected": -2.0952796936035156, + "step": 181 + }, + { + "epoch": 0.012657787669089265, + "grad_norm": 29.625, + "learning_rate": 9.999612074791838e-06, + "logits/chosen": -1.87381911277771, + "logits/rejected": -2.1587796211242676, + "logps/chosen": -389.4080810546875, + "logps/rejected": -275.0048522949219, + "loss": 0.3827, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.03666511923074722, + "rewards/margins": 1.5998127460479736, + "rewards/rejected": -1.5631475448608398, + "step": 182 + }, + { + "epoch": 0.01272733595298536, + "grad_norm": 34.0, + "learning_rate": 9.99960752444795e-06, + "logits/chosen": -2.1322197914123535, + "logits/rejected": -1.9937628507614136, + "logps/chosen": -289.8118591308594, + "logps/rejected": -219.06021118164062, + "loss": 0.4799, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8451073169708252, + "rewards/margins": 1.063767910003662, + "rewards/rejected": -1.9088752269744873, + "step": 183 + }, + { + "epoch": 0.012796884236881455, + "grad_norm": 51.0, + "learning_rate": 9.999602947572836e-06, + "logits/chosen": -1.8082635402679443, + "logits/rejected": -1.687127709388733, + "logps/chosen": -339.4537048339844, + "logps/rejected": -328.536865234375, + "loss": 0.531, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.3164316415786743, + "rewards/margins": 0.49887335300445557, + "rewards/rejected": -1.8153049945831299, + "step": 184 + }, + { + "epoch": 0.01286643252077755, + "grad_norm": 40.0, + "learning_rate": 9.999598344166523e-06, + "logits/chosen": -1.8754348754882812, + "logits/rejected": -1.798744559288025, + "logps/chosen": -310.56317138671875, + "logps/rejected": -273.77862548828125, + "loss": 0.4734, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.44789618253707886, + "rewards/margins": 1.318938970565796, + "rewards/rejected": -1.7668349742889404, + "step": 185 + }, + { + "epoch": 0.012935980804673645, + "grad_norm": 25.25, + "learning_rate": 9.999593714229032e-06, + "logits/chosen": -1.655369520187378, + "logits/rejected": -1.7022395133972168, + "logps/chosen": -317.2166748046875, + "logps/rejected": -331.00421142578125, + "loss": 0.4238, + "rewards/accuracies": 0.9000000357627869, + "rewards/chosen": -0.9760753512382507, + "rewards/margins": 1.0166157484054565, + "rewards/rejected": -1.9926912784576416, + "step": 186 + }, + { + "epoch": 0.01300552908856974, + "grad_norm": 55.5, + "learning_rate": 9.999589057760391e-06, + "logits/chosen": -1.8003004789352417, + "logits/rejected": -1.8385249376296997, + "logps/chosen": -311.56170654296875, + "logps/rejected": -359.531494140625, + "loss": 0.6904, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.8379611968994141, + "rewards/margins": 0.8078317046165466, + "rewards/rejected": -1.6457929611206055, + "step": 187 + }, + { + "epoch": 0.013075077372465835, + "grad_norm": 31.625, + "learning_rate": 9.999584374760623e-06, + "logits/chosen": -1.8551075458526611, + "logits/rejected": -1.6892132759094238, + "logps/chosen": -358.7176208496094, + "logps/rejected": -282.032470703125, + "loss": 0.3451, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.2877535820007324, + "rewards/margins": 1.8362914323806763, + "rewards/rejected": -2.124044895172119, + "step": 188 + }, + { + "epoch": 0.013144625656361928, + "grad_norm": 44.5, + "learning_rate": 9.999579665229752e-06, + "logits/chosen": -1.9371087551116943, + "logits/rejected": -2.053175926208496, + "logps/chosen": -354.4956970214844, + "logps/rejected": -279.7811584472656, + "loss": 0.3564, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.11990344524383545, + "rewards/margins": 1.9816598892211914, + "rewards/rejected": -2.1015634536743164, + "step": 189 + }, + { + "epoch": 0.013214173940258023, + "grad_norm": 32.25, + "learning_rate": 9.999574929167805e-06, + "logits/chosen": -1.9260172843933105, + "logits/rejected": -2.0538833141326904, + "logps/chosen": -291.3908386230469, + "logps/rejected": -299.64349365234375, + "loss": 0.352, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1350608468055725, + "rewards/margins": 2.0122268199920654, + "rewards/rejected": -2.147287607192993, + "step": 190 + }, + { + "epoch": 0.013283722224154118, + "grad_norm": 44.25, + "learning_rate": 9.999570166574805e-06, + "logits/chosen": -2.3310365676879883, + "logits/rejected": -2.2655506134033203, + "logps/chosen": -399.37408447265625, + "logps/rejected": -317.6855163574219, + "loss": 0.5267, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": -0.4006359279155731, + "rewards/margins": 1.088661789894104, + "rewards/rejected": -1.489297866821289, + "step": 191 + }, + { + "epoch": 0.013353270508050213, + "grad_norm": 33.25, + "learning_rate": 9.999565377450779e-06, + "logits/chosen": -2.0213544368743896, + "logits/rejected": -2.1548209190368652, + "logps/chosen": -376.2619934082031, + "logps/rejected": -340.0340576171875, + "loss": 0.3673, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.14570823311805725, + "rewards/margins": 2.087656021118164, + "rewards/rejected": -1.9419479370117188, + "step": 192 + }, + { + "epoch": 0.013422818791946308, + "grad_norm": 34.5, + "learning_rate": 9.999560561795751e-06, + "logits/chosen": -1.9112006425857544, + "logits/rejected": -1.7986373901367188, + "logps/chosen": -533.656982421875, + "logps/rejected": -227.78250122070312, + "loss": 0.3228, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 1.023513913154602, + "rewards/margins": 2.779780864715576, + "rewards/rejected": -1.7562668323516846, + "step": 193 + }, + { + "epoch": 0.013492367075842403, + "grad_norm": 47.5, + "learning_rate": 9.999555719609748e-06, + "logits/chosen": -2.237558364868164, + "logits/rejected": -2.139072895050049, + "logps/chosen": -506.278564453125, + "logps/rejected": -317.04962158203125, + "loss": 0.5028, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": -0.06306444108486176, + "rewards/margins": 1.0565602779388428, + "rewards/rejected": -1.1196247339248657, + "step": 194 + }, + { + "epoch": 0.013561915359738498, + "grad_norm": 28.0, + "learning_rate": 9.999550850892795e-06, + "logits/chosen": -2.0732860565185547, + "logits/rejected": -2.1191370487213135, + "logps/chosen": -304.3548583984375, + "logps/rejected": -408.0211181640625, + "loss": 0.3411, + "rewards/accuracies": 0.9500000476837158, + "rewards/chosen": -0.08858631551265717, + "rewards/margins": 1.8533482551574707, + "rewards/rejected": -1.941934585571289, + "step": 195 + }, + { + "epoch": 0.013631463643634593, + "grad_norm": 39.0, + "learning_rate": 9.999545955644919e-06, + "logits/chosen": -2.205195665359497, + "logits/rejected": -2.707919120788574, + "logps/chosen": -369.6107482910156, + "logps/rejected": -427.26470947265625, + "loss": 0.458, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.0987672433257103, + "rewards/margins": 1.4780166149139404, + "rewards/rejected": -1.5767837762832642, + "step": 196 + }, + { + "epoch": 0.013701011927530688, + "grad_norm": 30.25, + "learning_rate": 9.999541033866143e-06, + "logits/chosen": -1.5711662769317627, + "logits/rejected": -1.577465295791626, + "logps/chosen": -324.84283447265625, + "logps/rejected": -318.2061767578125, + "loss": 0.4861, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": 0.21440428495407104, + "rewards/margins": 1.6551949977874756, + "rewards/rejected": -1.4407908916473389, + "step": 197 + }, + { + "epoch": 0.013770560211426783, + "grad_norm": 55.0, + "learning_rate": 9.999536085556496e-06, + "logits/chosen": -1.672816276550293, + "logits/rejected": -1.76412034034729, + "logps/chosen": -396.17620849609375, + "logps/rejected": -353.64697265625, + "loss": 0.488, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.02332267165184021, + "rewards/margins": 0.8866921663284302, + "rewards/rejected": -0.8633695244789124, + "step": 198 + }, + { + "epoch": 0.013840108495322878, + "grad_norm": 44.0, + "learning_rate": 9.999531110716001e-06, + "logits/chosen": -2.1796417236328125, + "logits/rejected": -2.1057252883911133, + "logps/chosen": -621.5107421875, + "logps/rejected": -505.1883544921875, + "loss": 0.3719, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.9929578304290771, + "rewards/margins": 2.4168033599853516, + "rewards/rejected": -1.4238457679748535, + "step": 199 + }, + { + "epoch": 0.013909656779218973, + "grad_norm": 33.75, + "learning_rate": 9.99952610934469e-06, + "logits/chosen": -1.7231874465942383, + "logits/rejected": -1.9339120388031006, + "logps/chosen": -351.83001708984375, + "logps/rejected": -364.5266418457031, + "loss": 0.4206, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.18789076805114746, + "rewards/margins": 1.8819249868392944, + "rewards/rejected": -2.0698156356811523, + "step": 200 + }, + { + "epoch": 0.013979205063115068, + "grad_norm": 27.0, + "learning_rate": 9.999521081442584e-06, + "logits/chosen": -1.4979841709136963, + "logits/rejected": -1.5016597509384155, + "logps/chosen": -264.96527099609375, + "logps/rejected": -251.8504638671875, + "loss": 0.3849, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1430889219045639, + "rewards/margins": 1.5848755836486816, + "rewards/rejected": -1.7279644012451172, + "step": 201 + }, + { + "epoch": 0.014048753347011162, + "grad_norm": 49.25, + "learning_rate": 9.999516027009712e-06, + "logits/chosen": -2.508063793182373, + "logits/rejected": -2.726132392883301, + "logps/chosen": -362.24261474609375, + "logps/rejected": -353.7477111816406, + "loss": 0.4242, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.12691323459148407, + "rewards/margins": 1.710160255432129, + "rewards/rejected": -1.5832470655441284, + "step": 202 + }, + { + "epoch": 0.014118301630907257, + "grad_norm": 21.75, + "learning_rate": 9.999510946046102e-06, + "logits/chosen": -2.170590877532959, + "logits/rejected": -2.4350333213806152, + "logps/chosen": -499.0533447265625, + "logps/rejected": -452.04815673828125, + "loss": 0.3403, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.3078782260417938, + "rewards/margins": 1.861323356628418, + "rewards/rejected": -1.5534451007843018, + "step": 203 + }, + { + "epoch": 0.014187849914803352, + "grad_norm": 45.5, + "learning_rate": 9.999505838551778e-06, + "logits/chosen": -2.0342869758605957, + "logits/rejected": -2.1966803073883057, + "logps/chosen": -341.494140625, + "logps/rejected": -346.2054443359375, + "loss": 0.4677, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.3271878957748413, + "rewards/margins": 1.5306628942489624, + "rewards/rejected": -1.8578507900238037, + "step": 204 + }, + { + "epoch": 0.014257398198699447, + "grad_norm": 45.0, + "learning_rate": 9.99950070452677e-06, + "logits/chosen": -1.8647515773773193, + "logits/rejected": -1.882493495941162, + "logps/chosen": -303.35504150390625, + "logps/rejected": -275.32183837890625, + "loss": 0.5136, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.1402844786643982, + "rewards/margins": 1.0074436664581299, + "rewards/rejected": -1.1477282047271729, + "step": 205 + }, + { + "epoch": 0.014326946482595542, + "grad_norm": 54.75, + "learning_rate": 9.999495543971102e-06, + "logits/chosen": -2.0630650520324707, + "logits/rejected": -2.352976083755493, + "logps/chosen": -329.53125, + "logps/rejected": -455.3595886230469, + "loss": 0.5109, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.03590734302997589, + "rewards/margins": 1.6428947448730469, + "rewards/rejected": -1.6069872379302979, + "step": 206 + }, + { + "epoch": 0.014396494766491637, + "grad_norm": 39.75, + "learning_rate": 9.999490356884804e-06, + "logits/chosen": -2.206444501876831, + "logits/rejected": -2.315948486328125, + "logps/chosen": -407.25, + "logps/rejected": -351.0501708984375, + "loss": 0.4578, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.11781606078147888, + "rewards/margins": 1.1693882942199707, + "rewards/rejected": -1.287204384803772, + "step": 207 + }, + { + "epoch": 0.014466043050387732, + "grad_norm": 29.25, + "learning_rate": 9.999485143267904e-06, + "logits/chosen": -1.893094778060913, + "logits/rejected": -2.0974597930908203, + "logps/chosen": -584.1275634765625, + "logps/rejected": -314.01263427734375, + "loss": 0.387, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.5091556310653687, + "rewards/margins": 1.7171368598937988, + "rewards/rejected": -1.2079812288284302, + "step": 208 + }, + { + "epoch": 0.014535591334283827, + "grad_norm": 40.75, + "learning_rate": 9.999479903120429e-06, + "logits/chosen": -2.2817187309265137, + "logits/rejected": -2.121245861053467, + "logps/chosen": -520.0316162109375, + "logps/rejected": -320.553955078125, + "loss": 0.4021, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.8350095748901367, + "rewards/margins": 1.5682445764541626, + "rewards/rejected": -0.7332350611686707, + "step": 209 + }, + { + "epoch": 0.014605139618179922, + "grad_norm": 47.5, + "learning_rate": 9.999474636442405e-06, + "logits/chosen": -2.2908873558044434, + "logits/rejected": -2.1331443786621094, + "logps/chosen": -331.3529968261719, + "logps/rejected": -290.57183837890625, + "loss": 0.51, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": 0.4345221519470215, + "rewards/margins": 1.0294880867004395, + "rewards/rejected": -0.5949659943580627, + "step": 210 + }, + { + "epoch": 0.014674687902076017, + "grad_norm": 28.0, + "learning_rate": 9.999469343233862e-06, + "logits/chosen": -2.1385350227355957, + "logits/rejected": -2.025979518890381, + "logps/chosen": -557.4578247070312, + "logps/rejected": -370.121337890625, + "loss": 0.2454, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.8802362680435181, + "rewards/margins": 2.803412437438965, + "rewards/rejected": -1.9231762886047363, + "step": 211 + }, + { + "epoch": 0.014744236185972112, + "grad_norm": 28.75, + "learning_rate": 9.999464023494827e-06, + "logits/chosen": -2.0969138145446777, + "logits/rejected": -2.1559641361236572, + "logps/chosen": -381.64495849609375, + "logps/rejected": -330.919677734375, + "loss": 0.3428, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.19363020360469818, + "rewards/margins": 1.861811876296997, + "rewards/rejected": -1.6681816577911377, + "step": 212 + }, + { + "epoch": 0.014813784469868207, + "grad_norm": 46.75, + "learning_rate": 9.999458677225328e-06, + "logits/chosen": -1.9752206802368164, + "logits/rejected": -1.7805222272872925, + "logps/chosen": -277.7012939453125, + "logps/rejected": -320.5710144042969, + "loss": 0.6447, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.20221933722496033, + "rewards/margins": 0.7222654819488525, + "rewards/rejected": -0.9244848489761353, + "step": 213 + }, + { + "epoch": 0.0148833327537643, + "grad_norm": 59.0, + "learning_rate": 9.999453304425396e-06, + "logits/chosen": -2.1290416717529297, + "logits/rejected": -2.0942046642303467, + "logps/chosen": -397.95556640625, + "logps/rejected": -361.1854248046875, + "loss": 0.5387, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": 0.521556556224823, + "rewards/margins": 1.6347999572753906, + "rewards/rejected": -1.1132434606552124, + "step": 214 + }, + { + "epoch": 0.014952881037660395, + "grad_norm": 34.0, + "learning_rate": 9.999447905095055e-06, + "logits/chosen": -1.9988253116607666, + "logits/rejected": -2.205819606781006, + "logps/chosen": -319.399658203125, + "logps/rejected": -331.62109375, + "loss": 0.3372, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.7596837282180786, + "rewards/margins": 1.8261938095092773, + "rewards/rejected": -1.0665102005004883, + "step": 215 + }, + { + "epoch": 0.01502242932155649, + "grad_norm": 36.75, + "learning_rate": 9.999442479234338e-06, + "logits/chosen": -2.0875768661499023, + "logits/rejected": -2.037569046020508, + "logps/chosen": -429.2333984375, + "logps/rejected": -414.30926513671875, + "loss": 0.4189, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.8991371393203735, + "rewards/margins": 1.4311994314193726, + "rewards/rejected": -0.5320623517036438, + "step": 216 + }, + { + "epoch": 0.015091977605452585, + "grad_norm": 32.0, + "learning_rate": 9.999437026843271e-06, + "logits/chosen": -1.76893150806427, + "logits/rejected": -1.7810523509979248, + "logps/chosen": -527.56396484375, + "logps/rejected": -248.86346435546875, + "loss": 0.4131, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.8936858773231506, + "rewards/margins": 1.9587278366088867, + "rewards/rejected": -1.0650420188903809, + "step": 217 + }, + { + "epoch": 0.01516152588934868, + "grad_norm": 47.0, + "learning_rate": 9.999431547921882e-06, + "logits/chosen": -2.1833558082580566, + "logits/rejected": -1.9789881706237793, + "logps/chosen": -348.80419921875, + "logps/rejected": -310.640869140625, + "loss": 0.5368, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.34854382276535034, + "rewards/margins": 0.9944148063659668, + "rewards/rejected": -0.6458710432052612, + "step": 218 + }, + { + "epoch": 0.015231074173244775, + "grad_norm": 40.25, + "learning_rate": 9.999426042470206e-06, + "logits/chosen": -1.778428077697754, + "logits/rejected": -1.8460781574249268, + "logps/chosen": -346.1502685546875, + "logps/rejected": -404.47088623046875, + "loss": 0.3956, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.5621379613876343, + "rewards/margins": 1.5693655014038086, + "rewards/rejected": -1.0072274208068848, + "step": 219 + }, + { + "epoch": 0.01530062245714087, + "grad_norm": 44.75, + "learning_rate": 9.999420510488266e-06, + "logits/chosen": -2.06923770904541, + "logits/rejected": -2.0310211181640625, + "logps/chosen": -362.7739562988281, + "logps/rejected": -295.57318115234375, + "loss": 0.6801, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.048435211181640625, + "rewards/margins": 0.3057582676410675, + "rewards/rejected": -0.2573230564594269, + "step": 220 + }, + { + "epoch": 0.015370170741036965, + "grad_norm": 50.75, + "learning_rate": 9.999414951976094e-06, + "logits/chosen": -1.8466346263885498, + "logits/rejected": -2.000486135482788, + "logps/chosen": -413.29638671875, + "logps/rejected": -356.7690124511719, + "loss": 0.5405, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.7737308740615845, + "rewards/margins": 1.4369213581085205, + "rewards/rejected": -0.6631906032562256, + "step": 221 + }, + { + "epoch": 0.01543971902493306, + "grad_norm": 43.5, + "learning_rate": 9.999409366933718e-06, + "logits/chosen": -2.0028762817382812, + "logits/rejected": -2.0464870929718018, + "logps/chosen": -418.5832824707031, + "logps/rejected": -331.453857421875, + "loss": 0.3428, + "rewards/accuracies": 0.9000000357627869, + "rewards/chosen": 1.017188549041748, + "rewards/margins": 2.2469711303710938, + "rewards/rejected": -1.2297823429107666, + "step": 222 + }, + { + "epoch": 0.015509267308829155, + "grad_norm": 38.5, + "learning_rate": 9.99940375536117e-06, + "logits/chosen": -1.6828348636627197, + "logits/rejected": -1.5994112491607666, + "logps/chosen": -491.88995361328125, + "logps/rejected": -320.45855712890625, + "loss": 0.3759, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.864909291267395, + "rewards/margins": 1.9649956226348877, + "rewards/rejected": -1.1000864505767822, + "step": 223 + }, + { + "epoch": 0.01557881559272525, + "grad_norm": 66.0, + "learning_rate": 9.999398117258478e-06, + "logits/chosen": -2.1209323406219482, + "logits/rejected": -2.3252005577087402, + "logps/chosen": -437.80572509765625, + "logps/rejected": -415.0184326171875, + "loss": 0.4718, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": 0.3348587155342102, + "rewards/margins": 1.8569886684417725, + "rewards/rejected": -1.5221298933029175, + "step": 224 + }, + { + "epoch": 0.015648363876621343, + "grad_norm": 35.0, + "learning_rate": 9.999392452625672e-06, + "logits/chosen": -1.9107904434204102, + "logits/rejected": -1.8322871923446655, + "logps/chosen": -380.5047912597656, + "logps/rejected": -301.3290100097656, + "loss": 0.3947, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": 0.9005970358848572, + "rewards/margins": 1.6747466325759888, + "rewards/rejected": -0.774149477481842, + "step": 225 + }, + { + "epoch": 0.01571791216051744, + "grad_norm": 40.75, + "learning_rate": 9.999386761462782e-06, + "logits/chosen": -1.9186041355133057, + "logits/rejected": -1.9834249019622803, + "logps/chosen": -318.69146728515625, + "logps/rejected": -274.34661865234375, + "loss": 0.4552, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": 0.3906745910644531, + "rewards/margins": 1.7831642627716064, + "rewards/rejected": -1.3924896717071533, + "step": 226 + }, + { + "epoch": 0.015787460444413533, + "grad_norm": 36.75, + "learning_rate": 9.999381043769839e-06, + "logits/chosen": -1.8624000549316406, + "logits/rejected": -1.805497646331787, + "logps/chosen": -377.8289794921875, + "logps/rejected": -275.7497863769531, + "loss": 0.4022, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.5157461166381836, + "rewards/margins": 1.6855766773223877, + "rewards/rejected": -1.169830560684204, + "step": 227 + }, + { + "epoch": 0.01585700872830963, + "grad_norm": 37.5, + "learning_rate": 9.999375299546874e-06, + "logits/chosen": -1.8034157752990723, + "logits/rejected": -1.8767738342285156, + "logps/chosen": -291.2948303222656, + "logps/rejected": -303.38519287109375, + "loss": 0.422, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3053823411464691, + "rewards/margins": 1.8321738243103027, + "rewards/rejected": -1.5267914533615112, + "step": 228 + }, + { + "epoch": 0.015926557012205723, + "grad_norm": 34.75, + "learning_rate": 9.999369528793914e-06, + "logits/chosen": -1.9095003604888916, + "logits/rejected": -1.695231556892395, + "logps/chosen": -339.4863586425781, + "logps/rejected": -285.26812744140625, + "loss": 0.5022, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": 0.07680577039718628, + "rewards/margins": 1.3115811347961426, + "rewards/rejected": -1.2347753047943115, + "step": 229 + }, + { + "epoch": 0.01599610529610182, + "grad_norm": 64.0, + "learning_rate": 9.999363731510994e-06, + "logits/chosen": -2.231353282928467, + "logits/rejected": -2.2920260429382324, + "logps/chosen": -396.314453125, + "logps/rejected": -349.11798095703125, + "loss": 0.4677, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.05551982671022415, + "rewards/margins": 1.34597647190094, + "rewards/rejected": -1.2904565334320068, + "step": 230 + }, + { + "epoch": 0.016065653579997913, + "grad_norm": 43.75, + "learning_rate": 9.999357907698143e-06, + "logits/chosen": -1.7359389066696167, + "logits/rejected": -1.8999183177947998, + "logps/chosen": -390.4805908203125, + "logps/rejected": -322.5951232910156, + "loss": 0.5467, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2630568742752075, + "rewards/margins": 1.144690990447998, + "rewards/rejected": -1.407747745513916, + "step": 231 + }, + { + "epoch": 0.016135201863894007, + "grad_norm": 41.0, + "learning_rate": 9.999352057355391e-06, + "logits/chosen": -1.9229111671447754, + "logits/rejected": -1.7530392408370972, + "logps/chosen": -366.5455322265625, + "logps/rejected": -241.37484741210938, + "loss": 0.4318, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.164366215467453, + "rewards/margins": 1.7097082138061523, + "rewards/rejected": -1.5453418493270874, + "step": 232 + }, + { + "epoch": 0.016204750147790103, + "grad_norm": 40.5, + "learning_rate": 9.99934618048277e-06, + "logits/chosen": -1.7808085680007935, + "logits/rejected": -2.227796792984009, + "logps/chosen": -391.28094482421875, + "logps/rejected": -443.85113525390625, + "loss": 0.4698, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.6742685437202454, + "rewards/margins": 2.0273919105529785, + "rewards/rejected": -1.3531233072280884, + "step": 233 + }, + { + "epoch": 0.016274298431686197, + "grad_norm": 29.5, + "learning_rate": 9.999340277080312e-06, + "logits/chosen": -1.9381012916564941, + "logits/rejected": -2.347829580307007, + "logps/chosen": -247.67726135253906, + "logps/rejected": -259.3620300292969, + "loss": 0.443, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": -0.14791643619537354, + "rewards/margins": 1.5651779174804688, + "rewards/rejected": -1.7130944728851318, + "step": 234 + }, + { + "epoch": 0.016343846715582293, + "grad_norm": 36.0, + "learning_rate": 9.999334347148045e-06, + "logits/chosen": -1.9974865913391113, + "logits/rejected": -1.8274736404418945, + "logps/chosen": -297.1540832519531, + "logps/rejected": -300.1636962890625, + "loss": 0.4505, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.3986566662788391, + "rewards/margins": 1.0221314430236816, + "rewards/rejected": -1.4207881689071655, + "step": 235 + }, + { + "epoch": 0.016413394999478387, + "grad_norm": 29.875, + "learning_rate": 9.999328390686004e-06, + "logits/chosen": -2.0631520748138428, + "logits/rejected": -2.158726453781128, + "logps/chosen": -454.1333923339844, + "logps/rejected": -438.62103271484375, + "loss": 0.305, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 1.0545371770858765, + "rewards/margins": 3.075291156768799, + "rewards/rejected": -2.020753860473633, + "step": 236 + }, + { + "epoch": 0.016482943283374484, + "grad_norm": 48.5, + "learning_rate": 9.999322407694218e-06, + "logits/chosen": -2.1773579120635986, + "logits/rejected": -2.139359951019287, + "logps/chosen": -318.12249755859375, + "logps/rejected": -315.0210876464844, + "loss": 0.4797, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.02180662751197815, + "rewards/margins": 1.8442435264587402, + "rewards/rejected": -1.822437047958374, + "step": 237 + }, + { + "epoch": 0.016552491567270577, + "grad_norm": 24.375, + "learning_rate": 9.999316398172722e-06, + "logits/chosen": -1.854806661605835, + "logits/rejected": -1.9650962352752686, + "logps/chosen": -378.99761962890625, + "logps/rejected": -404.9398193359375, + "loss": 0.337, + "rewards/accuracies": 0.9500000476837158, + "rewards/chosen": -0.014172149822115898, + "rewards/margins": 1.738769292831421, + "rewards/rejected": -1.752941608428955, + "step": 238 + }, + { + "epoch": 0.016622039851166674, + "grad_norm": 48.75, + "learning_rate": 9.999310362121545e-06, + "logits/chosen": -2.0743541717529297, + "logits/rejected": -1.7327954769134521, + "logps/chosen": -308.22222900390625, + "logps/rejected": -266.5066223144531, + "loss": 0.6434, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.6367852687835693, + "rewards/margins": 0.6076662540435791, + "rewards/rejected": -1.2444515228271484, + "step": 239 + }, + { + "epoch": 0.016691588135062767, + "grad_norm": 47.75, + "learning_rate": 9.999304299540722e-06, + "logits/chosen": -1.972684383392334, + "logits/rejected": -2.1013457775115967, + "logps/chosen": -484.92120361328125, + "logps/rejected": -378.56060791015625, + "loss": 0.5046, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1059618592262268, + "rewards/margins": 1.0681445598602295, + "rewards/rejected": -0.9621827602386475, + "step": 240 + }, + { + "epoch": 0.016761136418958864, + "grad_norm": 36.25, + "learning_rate": 9.999298210430281e-06, + "logits/chosen": -1.9245837926864624, + "logits/rejected": -1.7762118577957153, + "logps/chosen": -383.8255920410156, + "logps/rejected": -326.61083984375, + "loss": 0.407, + "rewards/accuracies": 0.6500000357627869, + "rewards/chosen": 0.30191415548324585, + "rewards/margins": 2.217474937438965, + "rewards/rejected": -1.9155609607696533, + "step": 241 + }, + { + "epoch": 0.016830684702854957, + "grad_norm": 38.25, + "learning_rate": 9.999292094790258e-06, + "logits/chosen": -2.155402660369873, + "logits/rejected": -2.232357978820801, + "logps/chosen": -282.2188720703125, + "logps/rejected": -308.3016357421875, + "loss": 0.3999, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.05427606403827667, + "rewards/margins": 1.5494335889816284, + "rewards/rejected": -1.6037096977233887, + "step": 242 + }, + { + "epoch": 0.016900232986751054, + "grad_norm": 62.5, + "learning_rate": 9.999285952620683e-06, + "logits/chosen": -1.9465587139129639, + "logits/rejected": -1.8789935111999512, + "logps/chosen": -421.4027099609375, + "logps/rejected": -390.4100341796875, + "loss": 0.512, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.3320472538471222, + "rewards/margins": 1.5702850818634033, + "rewards/rejected": -1.2382376194000244, + "step": 243 + }, + { + "epoch": 0.016969781270647147, + "grad_norm": 24.375, + "learning_rate": 9.99927978392159e-06, + "logits/chosen": -2.0248632431030273, + "logits/rejected": -1.94873046875, + "logps/chosen": -373.67059326171875, + "logps/rejected": -278.0960693359375, + "loss": 0.3172, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.24709022045135498, + "rewards/margins": 1.9037537574768066, + "rewards/rejected": -1.656663417816162, + "step": 244 + }, + { + "epoch": 0.01703932955454324, + "grad_norm": 49.25, + "learning_rate": 9.99927358869301e-06, + "logits/chosen": -2.117713451385498, + "logits/rejected": -2.3150367736816406, + "logps/chosen": -341.63232421875, + "logps/rejected": -328.9519348144531, + "loss": 0.4857, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.03466576710343361, + "rewards/margins": 1.4635422229766846, + "rewards/rejected": -1.498207926750183, + "step": 245 + }, + { + "epoch": 0.017108877838439337, + "grad_norm": 40.0, + "learning_rate": 9.99926736693498e-06, + "logits/chosen": -2.0632028579711914, + "logits/rejected": -2.325160026550293, + "logps/chosen": -368.76519775390625, + "logps/rejected": -391.046630859375, + "loss": 0.4553, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": 0.20123496651649475, + "rewards/margins": 1.4122544527053833, + "rewards/rejected": -1.211019515991211, + "step": 246 + }, + { + "epoch": 0.01717842612233543, + "grad_norm": 58.75, + "learning_rate": 9.999261118647528e-06, + "logits/chosen": -2.1746110916137695, + "logits/rejected": -1.9165725708007812, + "logps/chosen": -473.6502685546875, + "logps/rejected": -333.47247314453125, + "loss": 0.5082, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.21951411664485931, + "rewards/margins": 1.7226784229278564, + "rewards/rejected": -1.503164291381836, + "step": 247 + }, + { + "epoch": 0.017247974406231527, + "grad_norm": 35.25, + "learning_rate": 9.999254843830689e-06, + "logits/chosen": -1.968876600265503, + "logits/rejected": -2.0403285026550293, + "logps/chosen": -381.8105773925781, + "logps/rejected": -303.3088073730469, + "loss": 0.449, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.34702008962631226, + "rewards/margins": 1.495229721069336, + "rewards/rejected": -1.148209571838379, + "step": 248 + }, + { + "epoch": 0.01731752269012762, + "grad_norm": 31.25, + "learning_rate": 9.999248542484498e-06, + "logits/chosen": -2.301229476928711, + "logits/rejected": -2.461914539337158, + "logps/chosen": -373.0932922363281, + "logps/rejected": -410.11859130859375, + "loss": 0.4076, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": 0.45917776226997375, + "rewards/margins": 1.5300498008728027, + "rewards/rejected": -1.0708720684051514, + "step": 249 + }, + { + "epoch": 0.017387070974023717, + "grad_norm": 37.0, + "learning_rate": 9.999242214608986e-06, + "logits/chosen": -1.8824834823608398, + "logits/rejected": -2.0930964946746826, + "logps/chosen": -307.494873046875, + "logps/rejected": -327.4150085449219, + "loss": 0.4079, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.3380080461502075, + "rewards/margins": 1.3163456916809082, + "rewards/rejected": -0.9783376455307007, + "step": 250 + }, + { + "epoch": 0.01745661925791981, + "grad_norm": 49.0, + "learning_rate": 9.999235860204189e-06, + "logits/chosen": -2.1352133750915527, + "logits/rejected": -2.152923583984375, + "logps/chosen": -375.20123291015625, + "logps/rejected": -296.0805358886719, + "loss": 0.5152, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.17316222190856934, + "rewards/margins": 1.4771215915679932, + "rewards/rejected": -1.3039593696594238, + "step": 251 + }, + { + "epoch": 0.017526167541815907, + "grad_norm": 25.5, + "learning_rate": 9.999229479270139e-06, + "logits/chosen": -2.130035877227783, + "logits/rejected": -1.985361099243164, + "logps/chosen": -328.40802001953125, + "logps/rejected": -295.8741455078125, + "loss": 0.2836, + "rewards/accuracies": 0.9500000476837158, + "rewards/chosen": 0.6370598673820496, + "rewards/margins": 2.1251730918884277, + "rewards/rejected": -1.4881134033203125, + "step": 252 + }, + { + "epoch": 0.017595715825712, + "grad_norm": 48.5, + "learning_rate": 9.99922307180687e-06, + "logits/chosen": -2.276228666305542, + "logits/rejected": -2.4257545471191406, + "logps/chosen": -349.7898254394531, + "logps/rejected": -471.5659484863281, + "loss": 0.4874, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": 0.05240285396575928, + "rewards/margins": 1.5981619358062744, + "rewards/rejected": -1.5457589626312256, + "step": 253 + }, + { + "epoch": 0.017665264109608097, + "grad_norm": 31.125, + "learning_rate": 9.999216637814415e-06, + "logits/chosen": -1.5717895030975342, + "logits/rejected": -1.810215711593628, + "logps/chosen": -320.3521423339844, + "logps/rejected": -370.4796142578125, + "loss": 0.3598, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.049725256860256195, + "rewards/margins": 1.6558666229248047, + "rewards/rejected": -1.7055919170379639, + "step": 254 + }, + { + "epoch": 0.01773481239350419, + "grad_norm": 42.0, + "learning_rate": 9.999210177292812e-06, + "logits/chosen": -1.9567430019378662, + "logits/rejected": -2.0528078079223633, + "logps/chosen": -427.17816162109375, + "logps/rejected": -291.81561279296875, + "loss": 0.4513, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.42975619435310364, + "rewards/margins": 1.5515944957733154, + "rewards/rejected": -1.1218382120132446, + "step": 255 + }, + { + "epoch": 0.017804360677400283, + "grad_norm": 36.0, + "learning_rate": 9.99920369024209e-06, + "logits/chosen": -1.9330898523330688, + "logits/rejected": -2.080015182495117, + "logps/chosen": -423.4591064453125, + "logps/rejected": -400.15869140625, + "loss": 0.3334, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.5689089298248291, + "rewards/margins": 2.277834892272949, + "rewards/rejected": -1.7089259624481201, + "step": 256 + }, + { + "epoch": 0.01787390896129638, + "grad_norm": 28.125, + "learning_rate": 9.999197176662288e-06, + "logits/chosen": -1.8132336139678955, + "logits/rejected": -1.7665352821350098, + "logps/chosen": -542.0451049804688, + "logps/rejected": -353.46368408203125, + "loss": 0.223, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.5211124420166016, + "rewards/margins": 3.5217456817626953, + "rewards/rejected": -2.000633478164673, + "step": 257 + }, + { + "epoch": 0.017943457245192473, + "grad_norm": 35.0, + "learning_rate": 9.999190636553437e-06, + "logits/chosen": -2.043771266937256, + "logits/rejected": -1.9034068584442139, + "logps/chosen": -435.72442626953125, + "logps/rejected": -298.32379150390625, + "loss": 0.394, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.6523581147193909, + "rewards/margins": 2.1012043952941895, + "rewards/rejected": -1.4488463401794434, + "step": 258 + }, + { + "epoch": 0.01801300552908857, + "grad_norm": 43.25, + "learning_rate": 9.999184069915575e-06, + "logits/chosen": -2.2854669094085693, + "logits/rejected": -2.248509407043457, + "logps/chosen": -356.4296875, + "logps/rejected": -279.60504150390625, + "loss": 0.509, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.06103089451789856, + "rewards/margins": 1.183120608329773, + "rewards/rejected": -1.2441514730453491, + "step": 259 + }, + { + "epoch": 0.018082553812984663, + "grad_norm": 30.0, + "learning_rate": 9.999177476748733e-06, + "logits/chosen": -1.952649474143982, + "logits/rejected": -2.0532007217407227, + "logps/chosen": -366.17510986328125, + "logps/rejected": -315.7154846191406, + "loss": 0.4089, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.3826763927936554, + "rewards/margins": 1.4696617126464844, + "rewards/rejected": -1.0869852304458618, + "step": 260 + }, + { + "epoch": 0.01815210209688076, + "grad_norm": 43.25, + "learning_rate": 9.999170857052951e-06, + "logits/chosen": -1.9158421754837036, + "logits/rejected": -2.030740737915039, + "logps/chosen": -345.9456787109375, + "logps/rejected": -290.9457092285156, + "loss": 0.536, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.4040408134460449, + "rewards/margins": 1.2132439613342285, + "rewards/rejected": -0.8092031478881836, + "step": 261 + }, + { + "epoch": 0.018221650380776853, + "grad_norm": 37.25, + "learning_rate": 9.999164210828261e-06, + "logits/chosen": -1.892512321472168, + "logits/rejected": -1.7746834754943848, + "logps/chosen": -328.541748046875, + "logps/rejected": -271.9040222167969, + "loss": 0.485, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.034983254969120026, + "rewards/margins": 1.3926091194152832, + "rewards/rejected": -1.3576257228851318, + "step": 262 + }, + { + "epoch": 0.01829119866467295, + "grad_norm": 41.75, + "learning_rate": 9.999157538074698e-06, + "logits/chosen": -2.017751455307007, + "logits/rejected": -2.0616979598999023, + "logps/chosen": -345.38922119140625, + "logps/rejected": -299.02972412109375, + "loss": 0.3812, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5056847333908081, + "rewards/margins": 1.9595838785171509, + "rewards/rejected": -1.4538992643356323, + "step": 263 + }, + { + "epoch": 0.018360746948569043, + "grad_norm": 33.5, + "learning_rate": 9.999150838792298e-06, + "logits/chosen": -1.9069550037384033, + "logits/rejected": -1.977041482925415, + "logps/chosen": -376.51171875, + "logps/rejected": -351.40496826171875, + "loss": 0.3283, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.8595783710479736, + "rewards/margins": 2.2939395904541016, + "rewards/rejected": -1.434361219406128, + "step": 264 + }, + { + "epoch": 0.01843029523246514, + "grad_norm": 41.0, + "learning_rate": 9.999144112981096e-06, + "logits/chosen": -1.8375849723815918, + "logits/rejected": -1.9802782535552979, + "logps/chosen": -327.59295654296875, + "logps/rejected": -270.1514892578125, + "loss": 0.459, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.597463846206665, + "rewards/margins": 1.5133674144744873, + "rewards/rejected": -0.9159038066864014, + "step": 265 + }, + { + "epoch": 0.018499843516361233, + "grad_norm": 23.875, + "learning_rate": 9.999137360641128e-06, + "logits/chosen": -2.055659294128418, + "logits/rejected": -2.0888304710388184, + "logps/chosen": -400.56097412109375, + "logps/rejected": -344.36932373046875, + "loss": 0.3599, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.7137490510940552, + "rewards/margins": 1.7756967544555664, + "rewards/rejected": -1.0619478225708008, + "step": 266 + }, + { + "epoch": 0.01856939180025733, + "grad_norm": 31.25, + "learning_rate": 9.999130581772431e-06, + "logits/chosen": -2.02939510345459, + "logits/rejected": -1.6350780725479126, + "logps/chosen": -348.89959716796875, + "logps/rejected": -213.58013916015625, + "loss": 0.3615, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 1.014988660812378, + "rewards/margins": 1.8006885051727295, + "rewards/rejected": -0.7856996655464172, + "step": 267 + }, + { + "epoch": 0.018638940084153423, + "grad_norm": 37.5, + "learning_rate": 9.99912377637504e-06, + "logits/chosen": -1.973205804824829, + "logits/rejected": -1.866929531097412, + "logps/chosen": -354.51983642578125, + "logps/rejected": -286.53662109375, + "loss": 0.4345, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.7219036817550659, + "rewards/margins": 1.581897258758545, + "rewards/rejected": -0.859993577003479, + "step": 268 + }, + { + "epoch": 0.018708488368049517, + "grad_norm": 18.375, + "learning_rate": 9.999116944448991e-06, + "logits/chosen": -1.8487286567687988, + "logits/rejected": -1.7855963706970215, + "logps/chosen": -318.345703125, + "logps/rejected": -247.75784301757812, + "loss": 0.3191, + "rewards/accuracies": 0.9500000476837158, + "rewards/chosen": 1.1773269176483154, + "rewards/margins": 1.6138646602630615, + "rewards/rejected": -0.4365377426147461, + "step": 269 + }, + { + "epoch": 0.018778036651945613, + "grad_norm": 31.25, + "learning_rate": 9.99911008599432e-06, + "logits/chosen": -1.6773293018341064, + "logits/rejected": -1.5826219320297241, + "logps/chosen": -370.059814453125, + "logps/rejected": -289.1189270019531, + "loss": 0.5226, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.2090034484863281, + "rewards/margins": 1.1156110763549805, + "rewards/rejected": 0.09339238703250885, + "step": 270 + }, + { + "epoch": 0.018847584935841707, + "grad_norm": 33.75, + "learning_rate": 9.999103201011065e-06, + "logits/chosen": -1.7550854682922363, + "logits/rejected": -1.9379616975784302, + "logps/chosen": -289.718505859375, + "logps/rejected": -349.5652770996094, + "loss": 0.5581, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4088086783885956, + "rewards/margins": 0.8532041907310486, + "rewards/rejected": -0.4443955421447754, + "step": 271 + }, + { + "epoch": 0.018917133219737804, + "grad_norm": 36.25, + "learning_rate": 9.99909628949926e-06, + "logits/chosen": -2.0328879356384277, + "logits/rejected": -2.362194061279297, + "logps/chosen": -361.02105712890625, + "logps/rejected": -228.35421752929688, + "loss": 0.3963, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 1.3291140794754028, + "rewards/margins": 1.8335614204406738, + "rewards/rejected": -0.5044471025466919, + "step": 272 + }, + { + "epoch": 0.018986681503633897, + "grad_norm": 39.0, + "learning_rate": 9.999089351458944e-06, + "logits/chosen": -1.8877999782562256, + "logits/rejected": -2.057575225830078, + "logps/chosen": -520.3101806640625, + "logps/rejected": -408.4241943359375, + "loss": 0.4102, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.7469838857650757, + "rewards/margins": 1.9468050003051758, + "rewards/rejected": -0.1998211145401001, + "step": 273 + }, + { + "epoch": 0.019056229787529994, + "grad_norm": 41.25, + "learning_rate": 9.999082386890154e-06, + "logits/chosen": -2.2060647010803223, + "logits/rejected": -1.9223867654800415, + "logps/chosen": -359.99066162109375, + "logps/rejected": -323.9928283691406, + "loss": 0.3881, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 1.3157947063446045, + "rewards/margins": 1.4769890308380127, + "rewards/rejected": -0.16119423508644104, + "step": 274 + }, + { + "epoch": 0.019125778071426087, + "grad_norm": 39.75, + "learning_rate": 9.999075395792923e-06, + "logits/chosen": -2.0603184700012207, + "logits/rejected": -2.1260790824890137, + "logps/chosen": -411.1520690917969, + "logps/rejected": -282.4197082519531, + "loss": 0.3786, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 1.8551716804504395, + "rewards/margins": 1.566945195198059, + "rewards/rejected": 0.2882263660430908, + "step": 275 + }, + { + "epoch": 0.019195326355322184, + "grad_norm": 52.5, + "learning_rate": 9.999068378167293e-06, + "logits/chosen": -1.8617322444915771, + "logits/rejected": -1.7675507068634033, + "logps/chosen": -421.9911804199219, + "logps/rejected": -277.80462646484375, + "loss": 0.6131, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 1.0071754455566406, + "rewards/margins": 1.0919893980026245, + "rewards/rejected": -0.08481380343437195, + "step": 276 + }, + { + "epoch": 0.019264874639218277, + "grad_norm": 26.125, + "learning_rate": 9.999061334013298e-06, + "logits/chosen": -1.8848862648010254, + "logits/rejected": -2.056020736694336, + "logps/chosen": -295.5935974121094, + "logps/rejected": -275.97003173828125, + "loss": 0.3869, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.0547502040863037, + "rewards/margins": 1.831303358078003, + "rewards/rejected": -0.7765531539916992, + "step": 277 + }, + { + "epoch": 0.019334422923114374, + "grad_norm": 44.5, + "learning_rate": 9.999054263330978e-06, + "logits/chosen": -2.1748833656311035, + "logits/rejected": -2.059750556945801, + "logps/chosen": -488.92083740234375, + "logps/rejected": -453.1601867675781, + "loss": 0.2898, + "rewards/accuracies": 0.9500000476837158, + "rewards/chosen": 1.4932392835617065, + "rewards/margins": 2.5335326194763184, + "rewards/rejected": -1.0402933359146118, + "step": 278 + }, + { + "epoch": 0.019403971207010467, + "grad_norm": 41.25, + "learning_rate": 9.999047166120366e-06, + "logits/chosen": -2.1607184410095215, + "logits/rejected": -2.0414276123046875, + "logps/chosen": -341.5518798828125, + "logps/rejected": -322.6283874511719, + "loss": 0.5431, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.9114878177642822, + "rewards/margins": 1.2505518198013306, + "rewards/rejected": -0.33906394243240356, + "step": 279 + }, + { + "epoch": 0.019473519490906564, + "grad_norm": 45.0, + "learning_rate": 9.999040042381507e-06, + "logits/chosen": -2.4068446159362793, + "logits/rejected": -2.4361181259155273, + "logps/chosen": -393.3025817871094, + "logps/rejected": -364.72491455078125, + "loss": 0.4627, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.590602457523346, + "rewards/margins": 1.0786361694335938, + "rewards/rejected": -0.4880337417125702, + "step": 280 + }, + { + "epoch": 0.019543067774802657, + "grad_norm": 57.25, + "learning_rate": 9.99903289211443e-06, + "logits/chosen": -1.5447850227355957, + "logits/rejected": -1.6932637691497803, + "logps/chosen": -291.7900695800781, + "logps/rejected": -374.8736877441406, + "loss": 0.3376, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.7603626847267151, + "rewards/margins": 2.660614490509033, + "rewards/rejected": -1.900251865386963, + "step": 281 + }, + { + "epoch": 0.01961261605869875, + "grad_norm": 42.0, + "learning_rate": 9.999025715319182e-06, + "logits/chosen": -1.94408118724823, + "logits/rejected": -2.0913758277893066, + "logps/chosen": -290.3719787597656, + "logps/rejected": -358.0947570800781, + "loss": 0.5035, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.18870389461517334, + "rewards/margins": 1.1102834939956665, + "rewards/rejected": -0.9215795397758484, + "step": 282 + }, + { + "epoch": 0.019682164342594847, + "grad_norm": 27.0, + "learning_rate": 9.999018511995793e-06, + "logits/chosen": -2.2143492698669434, + "logits/rejected": -2.338221549987793, + "logps/chosen": -377.51513671875, + "logps/rejected": -341.7450866699219, + "loss": 0.3147, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.36862418055534363, + "rewards/margins": 2.272181987762451, + "rewards/rejected": -1.9035576581954956, + "step": 283 + }, + { + "epoch": 0.01975171262649094, + "grad_norm": 25.0, + "learning_rate": 9.999011282144307e-06, + "logits/chosen": -1.431140422821045, + "logits/rejected": -1.325681447982788, + "logps/chosen": -297.9659118652344, + "logps/rejected": -277.4363098144531, + "loss": 0.3559, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.3559146225452423, + "rewards/margins": 1.8540118932724, + "rewards/rejected": -1.49809730052948, + "step": 284 + }, + { + "epoch": 0.019821260910387037, + "grad_norm": 27.125, + "learning_rate": 9.99900402576476e-06, + "logits/chosen": -1.6127164363861084, + "logits/rejected": -2.0458879470825195, + "logps/chosen": -299.2956848144531, + "logps/rejected": -365.5072937011719, + "loss": 0.3344, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.3525601625442505, + "rewards/margins": 2.5273356437683105, + "rewards/rejected": -2.1747756004333496, + "step": 285 + }, + { + "epoch": 0.01989080919428313, + "grad_norm": 44.0, + "learning_rate": 9.99899674285719e-06, + "logits/chosen": -2.0606703758239746, + "logits/rejected": -1.9583499431610107, + "logps/chosen": -366.701171875, + "logps/rejected": -269.7425231933594, + "loss": 0.3534, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5796060562133789, + "rewards/margins": 2.2505855560302734, + "rewards/rejected": -1.6709792613983154, + "step": 286 + }, + { + "epoch": 0.019960357478179227, + "grad_norm": 44.25, + "learning_rate": 9.998989433421635e-06, + "logits/chosen": -2.0142886638641357, + "logits/rejected": -2.185579299926758, + "logps/chosen": -417.862548828125, + "logps/rejected": -349.5841369628906, + "loss": 0.5221, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.05364346504211426, + "rewards/margins": 1.947020173072815, + "rewards/rejected": -1.8933767080307007, + "step": 287 + }, + { + "epoch": 0.02002990576207532, + "grad_norm": 29.125, + "learning_rate": 9.998982097458136e-06, + "logits/chosen": -2.3112895488739014, + "logits/rejected": -2.0557122230529785, + "logps/chosen": -372.5518493652344, + "logps/rejected": -268.6443176269531, + "loss": 0.3943, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.265166312456131, + "rewards/margins": 1.4740440845489502, + "rewards/rejected": -1.2088778018951416, + "step": 288 + }, + { + "epoch": 0.020099454045971417, + "grad_norm": 38.0, + "learning_rate": 9.998974734966733e-06, + "logits/chosen": -2.1377861499786377, + "logits/rejected": -1.8949990272521973, + "logps/chosen": -308.3575439453125, + "logps/rejected": -319.361572265625, + "loss": 0.412, + "rewards/accuracies": 0.9000000357627869, + "rewards/chosen": -0.38304007053375244, + "rewards/margins": 1.314139485359192, + "rewards/rejected": -1.6971795558929443, + "step": 289 + }, + { + "epoch": 0.02016900232986751, + "grad_norm": 40.25, + "learning_rate": 9.998967345947461e-06, + "logits/chosen": -2.086866855621338, + "logits/rejected": -2.2328994274139404, + "logps/chosen": -435.4238586425781, + "logps/rejected": -466.0227966308594, + "loss": 0.4459, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.7057145833969116, + "rewards/margins": 2.0903077125549316, + "rewards/rejected": -1.3845932483673096, + "step": 290 + }, + { + "epoch": 0.020238550613763607, + "grad_norm": 21.875, + "learning_rate": 9.998959930400362e-06, + "logits/chosen": -1.8002294301986694, + "logits/rejected": -1.7629457712173462, + "logps/chosen": -332.99639892578125, + "logps/rejected": -227.27084350585938, + "loss": 0.3268, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.24261386692523956, + "rewards/margins": 2.154963493347168, + "rewards/rejected": -1.9123495817184448, + "step": 291 + }, + { + "epoch": 0.0203080988976597, + "grad_norm": 32.0, + "learning_rate": 9.998952488325476e-06, + "logits/chosen": -1.7260394096374512, + "logits/rejected": -1.7405188083648682, + "logps/chosen": -351.88958740234375, + "logps/rejected": -295.6546630859375, + "loss": 0.3212, + "rewards/accuracies": 0.9500000476837158, + "rewards/chosen": 0.5101504325866699, + "rewards/margins": 2.341210126876831, + "rewards/rejected": -1.8310596942901611, + "step": 292 + }, + { + "epoch": 0.020377647181555793, + "grad_norm": 24.875, + "learning_rate": 9.998945019722838e-06, + "logits/chosen": -2.009561061859131, + "logits/rejected": -1.9932881593704224, + "logps/chosen": -396.3478088378906, + "logps/rejected": -364.591064453125, + "loss": 0.3438, + "rewards/accuracies": 0.9000000357627869, + "rewards/chosen": -0.01580490544438362, + "rewards/margins": 1.7669460773468018, + "rewards/rejected": -1.7827508449554443, + "step": 293 + }, + { + "epoch": 0.02044719546545189, + "grad_norm": 43.25, + "learning_rate": 9.998937524592493e-06, + "logits/chosen": -2.5986318588256836, + "logits/rejected": -2.6334242820739746, + "logps/chosen": -395.09344482421875, + "logps/rejected": -348.66729736328125, + "loss": 0.4422, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.04624226689338684, + "rewards/margins": 1.382677435874939, + "rewards/rejected": -1.4289196729660034, + "step": 294 + }, + { + "epoch": 0.020516743749347983, + "grad_norm": 66.5, + "learning_rate": 9.998930002934478e-06, + "logits/chosen": -2.058396339416504, + "logits/rejected": -2.366825580596924, + "logps/chosen": -391.14935302734375, + "logps/rejected": -405.056640625, + "loss": 0.6272, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.005402460694313049, + "rewards/margins": 0.8890234231948853, + "rewards/rejected": -0.8944259285926819, + "step": 295 + }, + { + "epoch": 0.02058629203324408, + "grad_norm": 48.75, + "learning_rate": 9.998922454748833e-06, + "logits/chosen": -2.1712827682495117, + "logits/rejected": -2.2793936729431152, + "logps/chosen": -489.3028564453125, + "logps/rejected": -356.74798583984375, + "loss": 0.3844, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.4628851413726807, + "rewards/margins": 1.7916350364685059, + "rewards/rejected": -0.3287498652935028, + "step": 296 + }, + { + "epoch": 0.020655840317140173, + "grad_norm": 57.75, + "learning_rate": 9.998914880035599e-06, + "logits/chosen": -1.8585426807403564, + "logits/rejected": -1.8981800079345703, + "logps/chosen": -367.3919677734375, + "logps/rejected": -306.17059326171875, + "loss": 0.488, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": 1.3362407684326172, + "rewards/margins": 1.7694376707077026, + "rewards/rejected": -0.43319693207740784, + "step": 297 + }, + { + "epoch": 0.02072538860103627, + "grad_norm": 24.75, + "learning_rate": 9.998907278794816e-06, + "logits/chosen": -1.7051329612731934, + "logits/rejected": -1.817283034324646, + "logps/chosen": -252.1783447265625, + "logps/rejected": -263.1280517578125, + "loss": 0.3194, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.8184145092964172, + "rewards/margins": 1.968191146850586, + "rewards/rejected": -1.149776816368103, + "step": 298 + }, + { + "epoch": 0.020794936884932363, + "grad_norm": 38.75, + "learning_rate": 9.998899651026524e-06, + "logits/chosen": -1.9182977676391602, + "logits/rejected": -1.9459434747695923, + "logps/chosen": -289.7865905761719, + "logps/rejected": -331.1701965332031, + "loss": 0.3551, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.4589896202087402, + "rewards/margins": 2.1429691314697266, + "rewards/rejected": -0.6839797496795654, + "step": 299 + }, + { + "epoch": 0.02086448516882846, + "grad_norm": 33.75, + "learning_rate": 9.998891996730764e-06, + "logits/chosen": -1.7773990631103516, + "logits/rejected": -1.9897887706756592, + "logps/chosen": -246.54998779296875, + "logps/rejected": -169.277099609375, + "loss": 0.4414, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 1.24934983253479, + "rewards/margins": 1.412894368171692, + "rewards/rejected": -0.16354450583457947, + "step": 300 + }, + { + "epoch": 0.020934033452724553, + "grad_norm": 44.5, + "learning_rate": 9.998884315907575e-06, + "logits/chosen": -1.6004377603530884, + "logits/rejected": -1.566001057624817, + "logps/chosen": -354.409912109375, + "logps/rejected": -311.75927734375, + "loss": 0.4477, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": 1.3121652603149414, + "rewards/margins": 1.5136685371398926, + "rewards/rejected": -0.20150315761566162, + "step": 301 + }, + { + "epoch": 0.02100358173662065, + "grad_norm": 34.5, + "learning_rate": 9.998876608557e-06, + "logits/chosen": -1.7888816595077515, + "logits/rejected": -1.7754836082458496, + "logps/chosen": -245.51495361328125, + "logps/rejected": -256.879150390625, + "loss": 0.5604, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 1.1313426494598389, + "rewards/margins": 0.7505848407745361, + "rewards/rejected": 0.3807578980922699, + "step": 302 + }, + { + "epoch": 0.021073130020516743, + "grad_norm": 30.25, + "learning_rate": 9.998868874679077e-06, + "logits/chosen": -1.6252179145812988, + "logits/rejected": -1.4771788120269775, + "logps/chosen": -305.2787170410156, + "logps/rejected": -217.00079345703125, + "loss": 0.464, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.994475245475769, + "rewards/margins": 1.0974442958831787, + "rewards/rejected": -0.10296905040740967, + "step": 303 + }, + { + "epoch": 0.02114267830441284, + "grad_norm": 42.5, + "learning_rate": 9.998861114273852e-06, + "logits/chosen": -2.0050337314605713, + "logits/rejected": -2.000460624694824, + "logps/chosen": -486.90692138671875, + "logps/rejected": -367.17327880859375, + "loss": 0.4423, + "rewards/accuracies": 0.75, + "rewards/chosen": 1.913007378578186, + "rewards/margins": 1.5107090473175049, + "rewards/rejected": 0.402298241853714, + "step": 304 + }, + { + "epoch": 0.021212226588308934, + "grad_norm": 37.75, + "learning_rate": 9.998853327341361e-06, + "logits/chosen": -1.7839711904525757, + "logits/rejected": -1.640040397644043, + "logps/chosen": -416.451416015625, + "logps/rejected": -337.4923095703125, + "loss": 0.4476, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 1.5019378662109375, + "rewards/margins": 1.4056788682937622, + "rewards/rejected": 0.09625905752182007, + "step": 305 + }, + { + "epoch": 0.021281774872205027, + "grad_norm": 33.75, + "learning_rate": 9.998845513881648e-06, + "logits/chosen": -1.8248884677886963, + "logits/rejected": -1.6131434440612793, + "logps/chosen": -364.4690856933594, + "logps/rejected": -202.36380004882812, + "loss": 0.4044, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 1.2310272455215454, + "rewards/margins": 2.2845940589904785, + "rewards/rejected": -1.053566813468933, + "step": 306 + }, + { + "epoch": 0.021351323156101124, + "grad_norm": 48.0, + "learning_rate": 9.998837673894752e-06, + "logits/chosen": -1.9052352905273438, + "logits/rejected": -2.1415843963623047, + "logps/chosen": -426.2255859375, + "logps/rejected": -317.00970458984375, + "loss": 0.4554, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 1.3588393926620483, + "rewards/margins": 2.0879080295562744, + "rewards/rejected": -0.7290686964988708, + "step": 307 + }, + { + "epoch": 0.021420871439997217, + "grad_norm": 54.25, + "learning_rate": 9.998829807380719e-06, + "logits/chosen": -2.254457950592041, + "logits/rejected": -2.189211368560791, + "logps/chosen": -424.0947265625, + "logps/rejected": -376.9266052246094, + "loss": 0.4862, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.9382611513137817, + "rewards/margins": 1.4721547365188599, + "rewards/rejected": -0.5338935852050781, + "step": 308 + }, + { + "epoch": 0.021490419723893314, + "grad_norm": 25.75, + "learning_rate": 9.998821914339585e-06, + "logits/chosen": -1.7740572690963745, + "logits/rejected": -1.8833811283111572, + "logps/chosen": -448.5695495605469, + "logps/rejected": -317.6328430175781, + "loss": 0.2449, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.1171178817749023, + "rewards/margins": 2.648395538330078, + "rewards/rejected": -1.5312775373458862, + "step": 309 + }, + { + "epoch": 0.021559968007789407, + "grad_norm": 44.5, + "learning_rate": 9.998813994771398e-06, + "logits/chosen": -1.8591368198394775, + "logits/rejected": -1.8577395677566528, + "logps/chosen": -315.5296630859375, + "logps/rejected": -298.69091796875, + "loss": 0.5297, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.13747955858707428, + "rewards/margins": 1.4082032442092896, + "rewards/rejected": -1.2707237005233765, + "step": 310 + }, + { + "epoch": 0.021629516291685504, + "grad_norm": 31.125, + "learning_rate": 9.998806048676196e-06, + "logits/chosen": -1.8708704710006714, + "logits/rejected": -1.724186897277832, + "logps/chosen": -388.44940185546875, + "logps/rejected": -347.62054443359375, + "loss": 0.3245, + "rewards/accuracies": 0.9000000357627869, + "rewards/chosen": -0.012494474649429321, + "rewards/margins": 2.352285861968994, + "rewards/rejected": -2.3647804260253906, + "step": 311 + }, + { + "epoch": 0.021699064575581597, + "grad_norm": 33.0, + "learning_rate": 9.998798076054022e-06, + "logits/chosen": -2.1707489490509033, + "logits/rejected": -2.4340691566467285, + "logps/chosen": -348.42047119140625, + "logps/rejected": -352.95355224609375, + "loss": 0.295, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.5266488790512085, + "rewards/margins": 1.8243894577026367, + "rewards/rejected": -1.2977404594421387, + "step": 312 + }, + { + "epoch": 0.021768612859477694, + "grad_norm": 50.0, + "learning_rate": 9.99879007690492e-06, + "logits/chosen": -2.056274890899658, + "logits/rejected": -2.159529685974121, + "logps/chosen": -422.12091064453125, + "logps/rejected": -317.92779541015625, + "loss": 0.5393, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 1.121394395828247, + "rewards/margins": 1.6500221490859985, + "rewards/rejected": -0.5286278128623962, + "step": 313 + }, + { + "epoch": 0.021838161143373787, + "grad_norm": 29.375, + "learning_rate": 9.99878205122893e-06, + "logits/chosen": -1.6740485429763794, + "logits/rejected": -1.5881948471069336, + "logps/chosen": -482.0579833984375, + "logps/rejected": -389.80877685546875, + "loss": 0.279, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.3146497011184692, + "rewards/margins": 2.7426650524139404, + "rewards/rejected": -1.4280154705047607, + "step": 314 + }, + { + "epoch": 0.021907709427269884, + "grad_norm": 64.5, + "learning_rate": 9.998773999026096e-06, + "logits/chosen": -2.017714500427246, + "logits/rejected": -2.201174736022949, + "logps/chosen": -519.3243408203125, + "logps/rejected": -440.4840087890625, + "loss": 0.3232, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.6596448421478271, + "rewards/margins": 3.579893112182617, + "rewards/rejected": -1.920248031616211, + "step": 315 + }, + { + "epoch": 0.021977257711165977, + "grad_norm": 27.5, + "learning_rate": 9.99876592029646e-06, + "logits/chosen": -2.1967852115631104, + "logits/rejected": -2.0210695266723633, + "logps/chosen": -350.0119934082031, + "logps/rejected": -240.9980926513672, + "loss": 0.3896, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.11416347324848175, + "rewards/margins": 1.5630916357040405, + "rewards/rejected": -1.4489281177520752, + "step": 316 + }, + { + "epoch": 0.02204680599506207, + "grad_norm": 45.0, + "learning_rate": 9.998757815040066e-06, + "logits/chosen": -2.1014249324798584, + "logits/rejected": -1.9721131324768066, + "logps/chosen": -375.44439697265625, + "logps/rejected": -319.7156677246094, + "loss": 0.4698, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": -0.3913065791130066, + "rewards/margins": 1.5011645555496216, + "rewards/rejected": -1.8924710750579834, + "step": 317 + }, + { + "epoch": 0.022116354278958167, + "grad_norm": 39.0, + "learning_rate": 9.998749683256954e-06, + "logits/chosen": -1.8546777963638306, + "logits/rejected": -2.0065112113952637, + "logps/chosen": -274.4776916503906, + "logps/rejected": -377.161865234375, + "loss": 0.3888, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.3553781509399414, + "rewards/margins": 1.5443460941314697, + "rewards/rejected": -1.8997243642807007, + "step": 318 + }, + { + "epoch": 0.02218590256285426, + "grad_norm": 54.0, + "learning_rate": 9.998741524947172e-06, + "logits/chosen": -2.1966562271118164, + "logits/rejected": -1.9962949752807617, + "logps/chosen": -373.61785888671875, + "logps/rejected": -375.5668640136719, + "loss": 0.5399, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.26897451281547546, + "rewards/margins": 2.1926229000091553, + "rewards/rejected": -1.9236483573913574, + "step": 319 + }, + { + "epoch": 0.022255450846750357, + "grad_norm": 42.75, + "learning_rate": 9.998733340110757e-06, + "logits/chosen": -1.847837209701538, + "logits/rejected": -1.6881626844406128, + "logps/chosen": -432.6614685058594, + "logps/rejected": -337.90869140625, + "loss": 0.5013, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.5641430616378784, + "rewards/margins": 1.6337227821350098, + "rewards/rejected": -1.0695797204971313, + "step": 320 + }, + { + "epoch": 0.02232499913064645, + "grad_norm": 25.375, + "learning_rate": 9.99872512874776e-06, + "logits/chosen": -2.0616447925567627, + "logits/rejected": -2.1580724716186523, + "logps/chosen": -390.9245300292969, + "logps/rejected": -318.7636413574219, + "loss": 0.3056, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.12200465798377991, + "rewards/margins": 2.0148186683654785, + "rewards/rejected": -2.1368231773376465, + "step": 321 + }, + { + "epoch": 0.022394547414542547, + "grad_norm": 30.125, + "learning_rate": 9.998716890858219e-06, + "logits/chosen": -2.104548931121826, + "logits/rejected": -2.1644372940063477, + "logps/chosen": -312.1165466308594, + "logps/rejected": -405.45086669921875, + "loss": 0.338, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.16226555407047272, + "rewards/margins": 2.181790828704834, + "rewards/rejected": -2.0195252895355225, + "step": 322 + }, + { + "epoch": 0.02246409569843864, + "grad_norm": 57.0, + "learning_rate": 9.99870862644218e-06, + "logits/chosen": -2.013852596282959, + "logits/rejected": -2.128829002380371, + "logps/chosen": -302.274169921875, + "logps/rejected": -254.9356689453125, + "loss": 0.5555, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": -0.227697491645813, + "rewards/margins": 1.2711269855499268, + "rewards/rejected": -1.4988245964050293, + "step": 323 + }, + { + "epoch": 0.022533643982334737, + "grad_norm": 28.25, + "learning_rate": 9.998700335499684e-06, + "logits/chosen": -2.206881523132324, + "logits/rejected": -1.8986985683441162, + "logps/chosen": -441.75323486328125, + "logps/rejected": -254.06561279296875, + "loss": 0.2646, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.701343834400177, + "rewards/margins": 3.485591411590576, + "rewards/rejected": -2.784248113632202, + "step": 324 + }, + { + "epoch": 0.02260319226623083, + "grad_norm": 34.25, + "learning_rate": 9.99869201803078e-06, + "logits/chosen": -2.034043550491333, + "logits/rejected": -2.1030874252319336, + "logps/chosen": -395.4072265625, + "logps/rejected": -272.001220703125, + "loss": 0.386, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": -0.14785130321979523, + "rewards/margins": 2.105660915374756, + "rewards/rejected": -2.253511905670166, + "step": 325 + }, + { + "epoch": 0.022672740550126927, + "grad_norm": 32.25, + "learning_rate": 9.99868367403551e-06, + "logits/chosen": -1.8006889820098877, + "logits/rejected": -1.6895575523376465, + "logps/chosen": -276.9715576171875, + "logps/rejected": -214.91058349609375, + "loss": 0.4283, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": -0.4706474542617798, + "rewards/margins": 1.6913641691207886, + "rewards/rejected": -2.1620116233825684, + "step": 326 + }, + { + "epoch": 0.02274228883402302, + "grad_norm": 69.0, + "learning_rate": 9.998675303513916e-06, + "logits/chosen": -2.351449966430664, + "logits/rejected": -2.6502959728240967, + "logps/chosen": -514.0968627929688, + "logps/rejected": -449.12261962890625, + "loss": 0.5839, + "rewards/accuracies": 0.6500000357627869, + "rewards/chosen": -0.32216402888298035, + "rewards/margins": 1.6712496280670166, + "rewards/rejected": -1.9934134483337402, + "step": 327 + }, + { + "epoch": 0.022811837117919117, + "grad_norm": 35.25, + "learning_rate": 9.998666906466044e-06, + "logits/chosen": -1.9273781776428223, + "logits/rejected": -1.7887461185455322, + "logps/chosen": -429.7943115234375, + "logps/rejected": -284.4364929199219, + "loss": 0.3644, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7208906412124634, + "rewards/margins": 2.020606279373169, + "rewards/rejected": -2.741497039794922, + "step": 328 + }, + { + "epoch": 0.02288138540181521, + "grad_norm": 64.5, + "learning_rate": 9.998658482891938e-06, + "logits/chosen": -2.1510708332061768, + "logits/rejected": -2.0667471885681152, + "logps/chosen": -398.9845275878906, + "logps/rejected": -338.9305114746094, + "loss": 0.4797, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.9103600978851318, + "rewards/margins": 1.7629425525665283, + "rewards/rejected": -2.67330265045166, + "step": 329 + }, + { + "epoch": 0.022950933685711303, + "grad_norm": 18.375, + "learning_rate": 9.998650032791643e-06, + "logits/chosen": -1.720512866973877, + "logits/rejected": -1.5750925540924072, + "logps/chosen": -374.6826171875, + "logps/rejected": -275.42132568359375, + "loss": 0.2712, + "rewards/accuracies": 0.9000000357627869, + "rewards/chosen": 0.019034549593925476, + "rewards/margins": 2.2670435905456543, + "rewards/rejected": -2.248009204864502, + "step": 330 + }, + { + "epoch": 0.0230204819696074, + "grad_norm": 41.75, + "learning_rate": 9.998641556165208e-06, + "logits/chosen": -2.3350253105163574, + "logits/rejected": -2.0491952896118164, + "logps/chosen": -506.7749938964844, + "logps/rejected": -326.72418212890625, + "loss": 0.3783, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.7544485330581665, + "rewards/margins": 2.3931283950805664, + "rewards/rejected": -1.6386797428131104, + "step": 331 + }, + { + "epoch": 0.023090030253503493, + "grad_norm": 39.75, + "learning_rate": 9.99863305301267e-06, + "logits/chosen": -2.00767183303833, + "logits/rejected": -1.8835625648498535, + "logps/chosen": -406.19921875, + "logps/rejected": -276.1683349609375, + "loss": 0.5686, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4231150150299072, + "rewards/margins": 1.647965908050537, + "rewards/rejected": -2.0710809230804443, + "step": 332 + }, + { + "epoch": 0.02315957853739959, + "grad_norm": 31.375, + "learning_rate": 9.998624523334079e-06, + "logits/chosen": -2.39424467086792, + "logits/rejected": -2.5254993438720703, + "logps/chosen": -390.3625183105469, + "logps/rejected": -310.3223876953125, + "loss": 0.3307, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.022478386759757996, + "rewards/margins": 2.452648401260376, + "rewards/rejected": -2.4751267433166504, + "step": 333 + }, + { + "epoch": 0.023229126821295683, + "grad_norm": 46.0, + "learning_rate": 9.998615967129482e-06, + "logits/chosen": -2.1730690002441406, + "logits/rejected": -2.4544053077697754, + "logps/chosen": -394.20928955078125, + "logps/rejected": -424.70147705078125, + "loss": 0.52, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.32069435715675354, + "rewards/margins": 1.2936482429504395, + "rewards/rejected": -1.614342451095581, + "step": 334 + }, + { + "epoch": 0.02329867510519178, + "grad_norm": 39.0, + "learning_rate": 9.99860738439892e-06, + "logits/chosen": -2.0129590034484863, + "logits/rejected": -2.267477035522461, + "logps/chosen": -425.21173095703125, + "logps/rejected": -348.4617004394531, + "loss": 0.3757, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.21831288933753967, + "rewards/margins": 2.727863311767578, + "rewards/rejected": -2.946176052093506, + "step": 335 + }, + { + "epoch": 0.023368223389087873, + "grad_norm": 35.5, + "learning_rate": 9.998598775142441e-06, + "logits/chosen": -2.11433744430542, + "logits/rejected": -1.8591749668121338, + "logps/chosen": -410.93487548828125, + "logps/rejected": -305.25830078125, + "loss": 0.385, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7329599261283875, + "rewards/margins": 2.2084593772888184, + "rewards/rejected": -2.9414191246032715, + "step": 336 + }, + { + "epoch": 0.02343777167298397, + "grad_norm": 41.0, + "learning_rate": 9.99859013936009e-06, + "logits/chosen": -1.6955194473266602, + "logits/rejected": -1.8602774143218994, + "logps/chosen": -344.70318603515625, + "logps/rejected": -287.33746337890625, + "loss": 0.4698, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.3858528435230255, + "rewards/margins": 1.474347710609436, + "rewards/rejected": -1.8602005243301392, + "step": 337 + }, + { + "epoch": 0.023507319956880064, + "grad_norm": 42.25, + "learning_rate": 9.998581477051911e-06, + "logits/chosen": -1.6440554857254028, + "logits/rejected": -2.1899595260620117, + "logps/chosen": -293.095947265625, + "logps/rejected": -280.3038330078125, + "loss": 0.5245, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.34018316864967346, + "rewards/margins": 1.236055612564087, + "rewards/rejected": -1.5762388706207275, + "step": 338 + }, + { + "epoch": 0.02357686824077616, + "grad_norm": 38.75, + "learning_rate": 9.998572788217955e-06, + "logits/chosen": -2.076382875442505, + "logits/rejected": -2.072500228881836, + "logps/chosen": -303.3077392578125, + "logps/rejected": -296.952880859375, + "loss": 0.4367, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.14497815072536469, + "rewards/margins": 1.5623383522033691, + "rewards/rejected": -1.4173603057861328, + "step": 339 + }, + { + "epoch": 0.023646416524672254, + "grad_norm": 36.25, + "learning_rate": 9.998564072858265e-06, + "logits/chosen": -1.8519854545593262, + "logits/rejected": -1.7056810855865479, + "logps/chosen": -429.7101745605469, + "logps/rejected": -266.96044921875, + "loss": 0.3239, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.836795449256897, + "rewards/margins": 2.820174217224121, + "rewards/rejected": -1.9833788871765137, + "step": 340 + }, + { + "epoch": 0.02371596480856835, + "grad_norm": 50.0, + "learning_rate": 9.998555330972886e-06, + "logits/chosen": -2.2210910320281982, + "logits/rejected": -2.6007394790649414, + "logps/chosen": -305.9600524902344, + "logps/rejected": -431.8408203125, + "loss": 0.5163, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.5642777681350708, + "rewards/margins": 1.6491864919662476, + "rewards/rejected": -1.0849087238311768, + "step": 341 + }, + { + "epoch": 0.023785513092464444, + "grad_norm": 50.75, + "learning_rate": 9.998546562561867e-06, + "logits/chosen": -2.0178515911102295, + "logits/rejected": -2.0557665824890137, + "logps/chosen": -325.2366638183594, + "logps/rejected": -328.65557861328125, + "loss": 0.5701, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": 0.5287948846817017, + "rewards/margins": 0.8900296688079834, + "rewards/rejected": -0.36123478412628174, + "step": 342 + }, + { + "epoch": 0.023855061376360537, + "grad_norm": 34.5, + "learning_rate": 9.998537767625252e-06, + "logits/chosen": -1.9050045013427734, + "logits/rejected": -1.892833948135376, + "logps/chosen": -371.8611145019531, + "logps/rejected": -256.9468994140625, + "loss": 0.4003, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 1.622910976409912, + "rewards/margins": 2.0016090869903564, + "rewards/rejected": -0.37869811058044434, + "step": 343 + }, + { + "epoch": 0.023924609660256634, + "grad_norm": 49.0, + "learning_rate": 9.99852894616309e-06, + "logits/chosen": -2.2060680389404297, + "logits/rejected": -2.379498243331909, + "logps/chosen": -280.77850341796875, + "logps/rejected": -386.83575439453125, + "loss": 0.5499, + "rewards/accuracies": 0.7000000476837158, + "rewards/chosen": 0.5310922265052795, + "rewards/margins": 1.3629069328308105, + "rewards/rejected": -0.8318146467208862, + "step": 344 + }, + { + "epoch": 0.023994157944152727, + "grad_norm": 43.25, + "learning_rate": 9.998520098175426e-06, + "logits/chosen": -2.000279426574707, + "logits/rejected": -1.9101908206939697, + "logps/chosen": -361.56854248046875, + "logps/rejected": -375.1758117675781, + "loss": 0.4567, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 1.2449432611465454, + "rewards/margins": 1.9816131591796875, + "rewards/rejected": -0.7366699576377869, + "step": 345 + }, + { + "epoch": 0.023994157944152727, + "eval_logits/chosen": -2.109806537628174, + "eval_logits/rejected": -2.0901401042938232, + "eval_logps/chosen": -351.2395935058594, + "eval_logps/rejected": -332.4879455566406, + "eval_loss": 0.44310393929481506, + "eval_rewards/accuracies": 0.7440476417541504, + "eval_rewards/chosen": 1.087096929550171, + "eval_rewards/margins": 1.6102018356323242, + "eval_rewards/rejected": -0.5231050252914429, + "eval_runtime": 58.4858, + "eval_samples_per_second": 2.838, + "eval_steps_per_second": 0.359, + "step": 345 } ], "logging_steps": 1,