diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -12,9 +12,9 @@ "epoch": 0.0, "learning_rate": 8.591065292096219e-10, "logits/chosen": -2.7645795345306396, - "logits/rejected": -2.8125061988830566, + "logits/rejected": -2.8575899600982666, "logps/chosen": -113.67314910888672, - "logps/rejected": -132.0498504638672, + "logps/rejected": -92.74945831298828, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, @@ -25,9072 +25,9072 @@ { "epoch": 0.01, "learning_rate": 8.59106529209622e-09, - "logits/chosen": -2.960064649581909, - "logits/rejected": -2.973264694213867, - "logps/chosen": -281.9268798828125, - "logps/rejected": -290.8782958984375, - "loss": 0.693, - "rewards/accuracies": 0.4027777910232544, - "rewards/chosen": -5.9445181250339374e-05, - "rewards/margins": -0.0016730944626033306, - "rewards/rejected": 0.0016136488411575556, + "logits/chosen": -2.955390453338623, + "logits/rejected": -2.9398293495178223, + "logps/chosen": -285.0723876953125, + "logps/rejected": -214.23397827148438, + "loss": 0.6927, + "rewards/accuracies": 0.4861111044883728, + "rewards/chosen": 0.004058043006807566, + "rewards/margins": 0.0012268200516700745, + "rewards/rejected": 0.0028312229551374912, "step": 10 }, { "epoch": 0.01, "learning_rate": 1.718213058419244e-08, - "logits/chosen": -2.9243266582489014, - "logits/rejected": -2.9458937644958496, - "logps/chosen": -219.5768280029297, - "logps/rejected": -229.09963989257812, - "loss": 0.6934, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": 0.009228921495378017, - "rewards/margins": 0.006980190984904766, - "rewards/rejected": 0.0022487309761345387, + "logits/chosen": -2.9239819049835205, + "logits/rejected": -2.934410810470581, + "logps/chosen": -219.6008758544922, + "logps/rejected": -190.2457733154297, + "loss": 0.69, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0068206205032765865, + "rewards/margins": 0.0067012617364525795, + "rewards/rejected": 0.0001193578791571781, "step": 20 }, { "epoch": 0.02, "learning_rate": 2.5773195876288656e-08, - "logits/chosen": -2.918198347091675, - "logits/rejected": -2.9479143619537354, - "logps/chosen": -284.60943603515625, - "logps/rejected": -312.8995056152344, - "loss": 0.6888, - "rewards/accuracies": 0.5, - "rewards/chosen": 0.0206824392080307, - "rewards/margins": 0.003921012859791517, - "rewards/rejected": 0.016761427745223045, + "logits/chosen": -2.9257287979125977, + "logits/rejected": -2.9198484420776367, + "logps/chosen": -284.4122314453125, + "logps/rejected": -242.7172088623047, + "loss": 0.6848, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.037454236298799515, + "rewards/margins": 0.011581240221858025, + "rewards/rejected": 0.02587299607694149, "step": 30 }, { "epoch": 0.02, "learning_rate": 3.436426116838488e-08, - "logits/chosen": -2.8567323684692383, - "logits/rejected": -2.896390438079834, - "logps/chosen": -322.072021484375, - "logps/rejected": -279.9306945800781, - "loss": 0.6811, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": 0.04524571821093559, - "rewards/margins": 0.03146430477499962, - "rewards/rejected": 0.013781411573290825, + "logits/chosen": -2.8577075004577637, + "logits/rejected": -2.9020836353302, + "logps/chosen": -323.2655334472656, + "logps/rejected": -251.8679656982422, + "loss": 0.6734, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.08393758535385132, + "rewards/margins": 0.046039044857025146, + "rewards/rejected": 0.03789854422211647, "step": 40 }, { "epoch": 0.03, "learning_rate": 4.29553264604811e-08, - "logits/chosen": -2.9978153705596924, - "logits/rejected": -3.0320467948913574, - "logps/chosen": -201.69992065429688, - "logps/rejected": -236.61203002929688, - "loss": 0.6587, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": 0.08155672252178192, - "rewards/margins": 0.07839250564575195, - "rewards/rejected": 0.0031642187386751175, + "logits/chosen": -2.9981656074523926, + "logits/rejected": -3.0263190269470215, + "logps/chosen": -201.6113739013672, + "logps/rejected": -192.55258178710938, + "loss": 0.6649, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.09041091054677963, + "rewards/margins": 0.050107330083847046, + "rewards/rejected": 0.04030359536409378, "step": 50 }, { "epoch": 0.03, "learning_rate": 5.154639175257731e-08, - "logits/chosen": -2.8887217044830322, - "logits/rejected": -2.8648734092712402, - "logps/chosen": -250.0323028564453, - "logps/rejected": -308.8505859375, - "loss": 0.6368, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.14510753750801086, - "rewards/margins": 0.12705275416374207, - "rewards/rejected": 0.01805477775633335, + "logits/chosen": -2.8880248069763184, + "logits/rejected": -2.9013853073120117, + "logps/chosen": -249.71340942382812, + "logps/rejected": -265.5022277832031, + "loss": 0.6478, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.17699608206748962, + "rewards/margins": 0.09277067333459854, + "rewards/rejected": 0.08422543108463287, "step": 60 }, { "epoch": 0.04, "learning_rate": 6.013745704467354e-08, - "logits/chosen": -2.95893931388855, - "logits/rejected": -2.941819190979004, - "logps/chosen": -304.4744567871094, - "logps/rejected": -301.7149353027344, - "loss": 0.6367, - "rewards/accuracies": 0.6875, - "rewards/chosen": 0.23246800899505615, - "rewards/margins": 0.17572557926177979, - "rewards/rejected": 0.056742388755083084, + "logits/chosen": -2.959083080291748, + "logits/rejected": -2.942816734313965, + "logps/chosen": -309.2194519042969, + "logps/rejected": -261.39276123046875, + "loss": 0.6546, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.2992553114891052, + "rewards/margins": 0.16975775361061096, + "rewards/rejected": 0.12949754297733307, "step": 70 }, { "epoch": 0.04, "learning_rate": 6.872852233676976e-08, - "logits/chosen": -2.955170154571533, - "logits/rejected": -2.9491288661956787, - "logps/chosen": -286.9136047363281, - "logps/rejected": -303.0902099609375, - "loss": 0.5875, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.30379384756088257, - "rewards/margins": 0.26180416345596313, - "rewards/rejected": 0.04198961704969406, + "logits/chosen": -2.9536004066467285, + "logits/rejected": -2.9475512504577637, + "logps/chosen": -286.50103759765625, + "logps/rejected": -245.46908569335938, + "loss": 0.6164, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.35541418194770813, + "rewards/margins": 0.25141167640686035, + "rewards/rejected": 0.10400249809026718, "step": 80 }, { "epoch": 0.05, "learning_rate": 7.731958762886598e-08, - "logits/chosen": -2.949690341949463, - "logits/rejected": -3.017688274383545, - "logps/chosen": -249.6548309326172, - "logps/rejected": -293.2023620605469, - "loss": 0.569, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": 0.33933955430984497, - "rewards/margins": 0.3257867693901062, - "rewards/rejected": 0.013552774675190449, + "logits/chosen": -2.9498467445373535, + "logits/rejected": -3.0005860328674316, + "logps/chosen": -256.3730773925781, + "logps/rejected": -235.06076049804688, + "loss": 0.6007, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.3177340626716614, + "rewards/margins": 0.2104119509458542, + "rewards/rejected": 0.10732214152812958, "step": 90 }, { "epoch": 0.05, "learning_rate": 8.59106529209622e-08, - "logits/chosen": -2.8686530590057373, - "logits/rejected": -2.9002976417541504, - "logps/chosen": -276.3162841796875, - "logps/rejected": -236.25296020507812, - "loss": 0.5613, + "logits/chosen": -2.865288257598877, + "logits/rejected": -2.8908486366271973, + "logps/chosen": -276.4228210449219, + "logps/rejected": -202.54351806640625, + "loss": 0.5849, "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": 0.4270657002925873, - "rewards/margins": 0.36059075593948364, - "rewards/rejected": 0.06647494435310364, + "rewards/chosen": 0.4164143204689026, + "rewards/margins": 0.3248804807662964, + "rewards/rejected": 0.0915338546037674, "step": 100 }, { "epoch": 0.05, - "eval_logits/chosen": -2.904775619506836, - "eval_logits/rejected": -2.9494807720184326, - "eval_logps/chosen": -243.06394958496094, - "eval_logps/rejected": -275.9722595214844, - "eval_loss": 0.5542330145835876, - "eval_rewards/accuracies": 0.7379999756813049, - "eval_rewards/chosen": 0.46162015199661255, - "eval_rewards/margins": 0.4451034665107727, - "eval_rewards/rejected": 0.01651667058467865, - "eval_runtime": 278.9085, - "eval_samples_per_second": 7.171, - "eval_steps_per_second": 0.448, + "eval_logits/chosen": -2.9063990116119385, + "eval_logits/rejected": -2.9389069080352783, + "eval_logps/chosen": -243.6803436279297, + "eval_logps/rejected": -225.2517852783203, + "eval_loss": 0.5762656927108765, + "eval_rewards/accuracies": 0.7319999933242798, + "eval_rewards/chosen": 0.4134175479412079, + "eval_rewards/margins": 0.3652128577232361, + "eval_rewards/rejected": 0.04820466786623001, + "eval_runtime": 276.1908, + "eval_samples_per_second": 7.241, + "eval_steps_per_second": 0.453, "step": 100 }, { "epoch": 0.06, "learning_rate": 9.450171821305841e-08, - "logits/chosen": -2.873464584350586, - "logits/rejected": -2.9402079582214355, - "logps/chosen": -246.4360809326172, - "logps/rejected": -280.4523620605469, - "loss": 0.5523, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": 0.4688049256801605, - "rewards/margins": 0.5279902219772339, - "rewards/rejected": -0.0591852143406868, + "logits/chosen": -2.873150587081909, + "logits/rejected": -2.932387113571167, + "logps/chosen": -247.15036010742188, + "logps/rejected": -220.5704803466797, + "loss": 0.575, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.39737793803215027, + "rewards/margins": 0.42108359932899475, + "rewards/rejected": -0.023705702275037766, "step": 110 }, { "epoch": 0.06, "learning_rate": 1.0309278350515462e-07, - "logits/chosen": -2.9436380863189697, - "logits/rejected": -2.9717044830322266, - "logps/chosen": -210.0995635986328, - "logps/rejected": -235.1615447998047, - "loss": 0.5107, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.4169088900089264, - "rewards/margins": 0.5272529125213623, - "rewards/rejected": -0.1103440523147583, + "logits/chosen": -2.941328287124634, + "logits/rejected": -2.973680019378662, + "logps/chosen": -210.8394317626953, + "logps/rejected": -165.41079711914062, + "loss": 0.5115, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 0.3431033194065094, + "rewards/margins": 0.5345997214317322, + "rewards/rejected": -0.19149646162986755, "step": 120 }, { "epoch": 0.07, "learning_rate": 1.1168384879725086e-07, - "logits/chosen": -3.001401901245117, - "logits/rejected": -3.032275676727295, - "logps/chosen": -285.7945556640625, - "logps/rejected": -291.1168518066406, - "loss": 0.5105, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.639220118522644, - "rewards/margins": 0.6806271076202393, - "rewards/rejected": -0.04140689969062805, + "logits/chosen": -3.0011961460113525, + "logits/rejected": -3.0338339805603027, + "logps/chosen": -286.9122619628906, + "logps/rejected": -236.15316772460938, + "loss": 0.5166, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.5293468236923218, + "rewards/margins": 0.5481607913970947, + "rewards/rejected": -0.01881396397948265, "step": 130 }, { "epoch": 0.07, "learning_rate": 1.202749140893471e-07, - "logits/chosen": -2.940995931625366, - "logits/rejected": -2.9406614303588867, - "logps/chosen": -285.1832275390625, - "logps/rejected": -329.8865661621094, - "loss": 0.507, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": 0.5072210431098938, - "rewards/margins": 0.8132984042167664, - "rewards/rejected": -0.3060774505138397, + "logits/chosen": -2.938509941101074, + "logits/rejected": -2.9479477405548096, + "logps/chosen": -285.69134521484375, + "logps/rejected": -263.5352783203125, + "loss": 0.4953, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.45640572905540466, + "rewards/margins": 0.7825866341590881, + "rewards/rejected": -0.32618090510368347, "step": 140 }, { "epoch": 0.08, "learning_rate": 1.2886597938144328e-07, - "logits/chosen": -2.9889254570007324, - "logits/rejected": -2.98456072807312, - "logps/chosen": -220.37387084960938, - "logps/rejected": -282.42962646484375, - "loss": 0.4732, - "rewards/accuracies": 0.8125, - "rewards/chosen": 0.5785535573959351, - "rewards/margins": 0.9343128204345703, - "rewards/rejected": -0.3557590842247009, + "logits/chosen": -2.987955331802368, + "logits/rejected": -2.988236427307129, + "logps/chosen": -222.3317108154297, + "logps/rejected": -253.3473663330078, + "loss": 0.4691, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 0.3827708661556244, + "rewards/margins": 0.8588326573371887, + "rewards/rejected": -0.47606176137924194, "step": 150 }, { "epoch": 0.08, "learning_rate": 1.3745704467353952e-07, - "logits/chosen": -2.8972601890563965, - "logits/rejected": -2.9279913902282715, - "logps/chosen": -272.812255859375, - "logps/rejected": -280.38641357421875, - "loss": 0.4883, - "rewards/accuracies": 0.8125, - "rewards/chosen": 0.5051306486129761, - "rewards/margins": 0.8609280586242676, - "rewards/rejected": -0.35579735040664673, + "logits/chosen": -2.8914453983306885, + "logits/rejected": -2.9082510471343994, + "logps/chosen": -274.43707275390625, + "logps/rejected": -215.9786834716797, + "loss": 0.4663, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 0.3427375853061676, + "rewards/margins": 0.8972614407539368, + "rewards/rejected": -0.5545238256454468, "step": 160 }, { "epoch": 0.09, "learning_rate": 1.4604810996563573e-07, - "logits/chosen": -2.9837818145751953, - "logits/rejected": -2.9798386096954346, - "logps/chosen": -236.0260009765625, - "logps/rejected": -272.0481262207031, - "loss": 0.426, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": 0.6715840101242065, - "rewards/margins": 1.2586230039596558, - "rewards/rejected": -0.5870389938354492, + "logits/chosen": -2.9795353412628174, + "logits/rejected": -2.9609813690185547, + "logps/chosen": -238.555419921875, + "logps/rejected": -200.61390686035156, + "loss": 0.4125, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 0.41908708214759827, + "rewards/margins": 1.2952790260314941, + "rewards/rejected": -0.8761919736862183, "step": 170 }, { "epoch": 0.09, "learning_rate": 1.5463917525773197e-07, - "logits/chosen": -2.9777297973632812, - "logits/rejected": -2.9535796642303467, - "logps/chosen": -179.25314331054688, - "logps/rejected": -235.5211944580078, - "loss": 0.4372, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": 0.6480907797813416, - "rewards/margins": 0.9972507357597351, - "rewards/rejected": -0.3491598665714264, + "logits/chosen": -2.981524705886841, + "logits/rejected": -2.944304943084717, + "logps/chosen": -181.98141479492188, + "logps/rejected": -214.07400512695312, + "loss": 0.4121, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.3756888508796692, + "rewards/margins": 1.067429780960083, + "rewards/rejected": -0.691740870475769, "step": 180 }, { "epoch": 0.1, "learning_rate": 1.6323024054982818e-07, - "logits/chosen": -2.9201600551605225, - "logits/rejected": -2.931079626083374, - "logps/chosen": -259.4476013183594, - "logps/rejected": -317.50244140625, - "loss": 0.4526, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": 0.7379086017608643, - "rewards/margins": 1.142943263053894, - "rewards/rejected": -0.4050346910953522, + "logits/chosen": -2.9182937145233154, + "logits/rejected": -2.939922332763672, + "logps/chosen": -260.34075927734375, + "logps/rejected": -287.94464111328125, + "loss": 0.3929, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.6485947966575623, + "rewards/margins": 1.2688244581222534, + "rewards/rejected": -0.6202298402786255, "step": 190 }, { "epoch": 0.1, "learning_rate": 1.718213058419244e-07, - "logits/chosen": -2.8957831859588623, - "logits/rejected": -2.929515838623047, - "logps/chosen": -302.5355224609375, - "logps/rejected": -221.0574951171875, - "loss": 0.4215, - "rewards/accuracies": 0.8125, - "rewards/chosen": 0.5478682518005371, - "rewards/margins": 1.2445906400680542, - "rewards/rejected": -0.6967223882675171, + "logits/chosen": -2.8991127014160156, + "logits/rejected": -2.8910486698150635, + "logps/chosen": -303.3644104003906, + "logps/rejected": -164.12216186523438, + "loss": 0.3703, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.4664244055747986, + "rewards/margins": 1.6392621994018555, + "rewards/rejected": -1.1728379726409912, "step": 200 }, { "epoch": 0.1, - "eval_logits/chosen": -2.8915369510650635, - "eval_logits/rejected": -2.9388110637664795, - "eval_logps/chosen": -242.70468139648438, - "eval_logps/rejected": -283.1268310546875, - "eval_loss": 0.46274256706237793, - "eval_rewards/accuracies": 0.7839999794960022, - "eval_rewards/chosen": 0.4975453019142151, - "eval_rewards/margins": 1.1964877843856812, - "eval_rewards/rejected": -0.6989425420761108, - "eval_runtime": 278.9272, - "eval_samples_per_second": 7.17, - "eval_steps_per_second": 0.448, + "eval_logits/chosen": -2.8993499279022217, + "eval_logits/rejected": -2.9371559619903564, + "eval_logps/chosen": -244.9145050048828, + "eval_logps/rejected": -237.05264282226562, + "eval_loss": 0.4118317663669586, + "eval_rewards/accuracies": 0.7900000214576721, + "eval_rewards/chosen": 0.2899988889694214, + "eval_rewards/margins": 1.4218796491622925, + "eval_rewards/rejected": -1.131880760192871, + "eval_runtime": 276.0867, + "eval_samples_per_second": 7.244, + "eval_steps_per_second": 0.453, "step": 200 }, { "epoch": 0.11, "learning_rate": 1.804123711340206e-07, - "logits/chosen": -2.8694968223571777, - "logits/rejected": -2.865325689315796, - "logps/chosen": -251.77734375, - "logps/rejected": -315.6266174316406, - "loss": 0.4789, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": 0.41053009033203125, - "rewards/margins": 1.1092506647109985, - "rewards/rejected": -0.6987205743789673, + "logits/chosen": -2.872159957885742, + "logits/rejected": -2.879067897796631, + "logps/chosen": -254.1824188232422, + "logps/rejected": -269.99847412109375, + "loss": 0.4438, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.1703362762928009, + "rewards/margins": 1.2109404802322388, + "rewards/rejected": -1.0406041145324707, "step": 210 }, { "epoch": 0.11, "learning_rate": 1.8900343642611682e-07, - "logits/chosen": -2.955479145050049, - "logits/rejected": -2.920375108718872, - "logps/chosen": -290.3009338378906, - "logps/rejected": -279.76873779296875, - "loss": 0.4596, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": 0.48892465233802795, - "rewards/margins": 1.2118638753890991, - "rewards/rejected": -0.7229393720626831, + "logits/chosen": -2.9596405029296875, + "logits/rejected": -2.931642532348633, + "logps/chosen": -292.19708251953125, + "logps/rejected": -237.60208129882812, + "loss": 0.3622, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.2993105947971344, + "rewards/margins": 1.6589787006378174, + "rewards/rejected": -1.3596680164337158, "step": 220 }, { "epoch": 0.12, "learning_rate": 1.9759450171821303e-07, - "logits/chosen": -2.9636857509613037, - "logits/rejected": -2.9484169483184814, - "logps/chosen": -257.77325439453125, - "logps/rejected": -290.58416748046875, - "loss": 0.4495, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": 0.5090667009353638, - "rewards/margins": 1.1924711465835571, - "rewards/rejected": -0.6834043264389038, + "logits/chosen": -2.9739303588867188, + "logits/rejected": -2.971005439758301, + "logps/chosen": -260.6449279785156, + "logps/rejected": -226.1801300048828, + "loss": 0.3656, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.22189848124980927, + "rewards/margins": 1.5202922821044922, + "rewards/rejected": -1.2983938455581665, "step": 230 }, { "epoch": 0.12, "learning_rate": 2.0618556701030925e-07, - "logits/chosen": -2.9927587509155273, - "logits/rejected": -2.982868194580078, - "logps/chosen": -300.302978515625, - "logps/rejected": -264.7530212402344, - "loss": 0.4626, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": 0.48255831003189087, - "rewards/margins": 1.3429360389709473, - "rewards/rejected": -0.8603779077529907, + "logits/chosen": -3.0024609565734863, + "logits/rejected": -2.982093334197998, + "logps/chosen": -303.9729309082031, + "logps/rejected": -220.65725708007812, + "loss": 0.4173, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.11385570466518402, + "rewards/margins": 1.6870304346084595, + "rewards/rejected": -1.5731747150421143, "step": 240 }, { "epoch": 0.13, "learning_rate": 2.1477663230240549e-07, - "logits/chosen": -3.0206644535064697, - "logits/rejected": -3.019110679626465, - "logps/chosen": -266.93206787109375, - "logps/rejected": -293.4591369628906, - "loss": 0.4382, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.3956943452358246, - "rewards/margins": 1.2525049448013306, - "rewards/rejected": -0.8568106889724731, + "logits/chosen": -3.0265252590179443, + "logits/rejected": -3.0173137187957764, + "logps/chosen": -269.44415283203125, + "logps/rejected": -236.6900634765625, + "loss": 0.3689, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.1444779932498932, + "rewards/margins": 1.6794666051864624, + "rewards/rejected": -1.5349886417388916, "step": 250 }, { "epoch": 0.13, "learning_rate": 2.2336769759450173e-07, - "logits/chosen": -3.004024028778076, - "logits/rejected": -2.9846959114074707, - "logps/chosen": -276.8220520019531, - "logps/rejected": -279.55718994140625, - "loss": 0.423, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": 0.7008559703826904, - "rewards/margins": 1.6333271265029907, - "rewards/rejected": -0.932470977306366, + "logits/chosen": -3.0136585235595703, + "logits/rejected": -3.0067028999328613, + "logps/chosen": -280.31951904296875, + "logps/rejected": -230.7751007080078, + "loss": 0.3423, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.35110995173454285, + "rewards/margins": 2.2199208736419678, + "rewards/rejected": -1.8688112497329712, "step": 260 }, { "epoch": 0.14, "learning_rate": 2.3195876288659794e-07, - "logits/chosen": -2.964247941970825, - "logits/rejected": -3.0104923248291016, - "logps/chosen": -233.50796508789062, - "logps/rejected": -284.2912902832031, - "loss": 0.3909, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": 0.5938805341720581, - "rewards/margins": 1.5185127258300781, - "rewards/rejected": -0.92463219165802, + "logits/chosen": -2.9820873737335205, + "logits/rejected": -3.0196051597595215, + "logps/chosen": -249.1240234375, + "logps/rejected": -242.77761840820312, + "loss": 0.3224, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.3122195899486542, + "rewards/margins": 1.9349424839019775, + "rewards/rejected": -1.622722864151001, "step": 270 }, { "epoch": 0.14, "learning_rate": 2.405498281786942e-07, - "logits/chosen": -2.899641275405884, - "logits/rejected": -2.943345546722412, - "logps/chosen": -300.43194580078125, - "logps/rejected": -302.929931640625, - "loss": 0.4353, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": 0.38024798035621643, - "rewards/margins": 1.3194282054901123, - "rewards/rejected": -0.9391803741455078, + "logits/chosen": -2.9121787548065186, + "logits/rejected": -2.9569742679595947, + "logps/chosen": -302.5876770019531, + "logps/rejected": -269.4565734863281, + "loss": 0.3591, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.1645125150680542, + "rewards/margins": 1.677587866783142, + "rewards/rejected": -1.5130754709243774, "step": 280 }, { "epoch": 0.15, "learning_rate": 2.4914089347079036e-07, - "logits/chosen": -2.897779941558838, - "logits/rejected": -2.932300090789795, - "logps/chosen": -289.35992431640625, - "logps/rejected": -324.7225341796875, - "loss": 0.4472, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": 0.767623782157898, - "rewards/margins": 1.861802101135254, - "rewards/rejected": -1.0941781997680664, + "logits/chosen": -2.909233570098877, + "logits/rejected": -2.9327633380889893, + "logps/chosen": -293.98834228515625, + "logps/rejected": -249.46749877929688, + "loss": 0.3332, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.6848534941673279, + "rewards/margins": 2.8158535957336426, + "rewards/rejected": -2.13100004196167, "step": 290 }, { "epoch": 0.15, "learning_rate": 2.5773195876288655e-07, - "logits/chosen": -2.9786391258239746, - "logits/rejected": -2.98624849319458, - "logps/chosen": -258.1387634277344, - "logps/rejected": -287.59051513671875, - "loss": 0.4508, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": 0.4574419856071472, - "rewards/margins": 1.4439876079559326, - "rewards/rejected": -0.9865456819534302, + "logits/chosen": -2.986581325531006, + "logits/rejected": -3.000779628753662, + "logps/chosen": -261.3474426269531, + "logps/rejected": -245.00784301757812, + "loss": 0.4041, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.13657422363758087, + "rewards/margins": 1.9374473094940186, + "rewards/rejected": -1.800873041152954, "step": 300 }, { "epoch": 0.15, - "eval_logits/chosen": -2.9006266593933105, - "eval_logits/rejected": -2.951172351837158, - "eval_logps/chosen": -243.17063903808594, - "eval_logps/rejected": -287.9976806640625, - "eval_loss": 0.47074779868125916, - "eval_rewards/accuracies": 0.7839999794960022, - "eval_rewards/chosen": 0.4509522318840027, - "eval_rewards/margins": 1.6369801759719849, - "eval_rewards/rejected": -1.1860281229019165, - "eval_runtime": 278.6174, - "eval_samples_per_second": 7.178, - "eval_steps_per_second": 0.449, + "eval_logits/chosen": -2.9179325103759766, + "eval_logits/rejected": -2.955073356628418, + "eval_logps/chosen": -246.43472290039062, + "eval_logps/rejected": -244.5528564453125, + "eval_loss": 0.4333568811416626, + "eval_rewards/accuracies": 0.8180000185966492, + "eval_rewards/chosen": 0.13797786831855774, + "eval_rewards/margins": 2.019880533218384, + "eval_rewards/rejected": -1.881902813911438, + "eval_runtime": 276.3633, + "eval_samples_per_second": 7.237, + "eval_steps_per_second": 0.452, "step": 300 }, { "epoch": 0.16, "learning_rate": 2.663230240549828e-07, - "logits/chosen": -3.018040180206299, - "logits/rejected": -3.0253515243530273, - "logps/chosen": -266.28204345703125, - "logps/rejected": -265.2425842285156, - "loss": 0.437, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.318104088306427, - "rewards/margins": 1.3584096431732178, - "rewards/rejected": -1.040305733680725, + "logits/chosen": -3.029100179672241, + "logits/rejected": -3.0314645767211914, + "logps/chosen": -268.97039794921875, + "logps/rejected": -217.917236328125, + "loss": 0.3558, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.04926682263612747, + "rewards/margins": 1.9569294452667236, + "rewards/rejected": -1.9076626300811768, "step": 310 }, { "epoch": 0.17, "learning_rate": 2.7491408934707903e-07, - "logits/chosen": -2.9877212047576904, - "logits/rejected": -2.993577241897583, - "logps/chosen": -281.7521057128906, - "logps/rejected": -267.09228515625, - "loss": 0.4209, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": 0.4347918927669525, - "rewards/margins": 1.756643295288086, - "rewards/rejected": -1.3218514919281006, + "logits/chosen": -2.994835615158081, + "logits/rejected": -3.0310540199279785, + "logps/chosen": -282.8102722167969, + "logps/rejected": -211.141357421875, + "loss": 0.3329, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 0.3289766311645508, + "rewards/margins": 2.667945623397827, + "rewards/rejected": -2.3389692306518555, "step": 320 }, { "epoch": 0.17, "learning_rate": 2.835051546391752e-07, - "logits/chosen": -2.974451780319214, - "logits/rejected": -2.9852652549743652, - "logps/chosen": -280.6615295410156, - "logps/rejected": -287.22967529296875, - "loss": 0.346, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": 0.6992789506912231, - "rewards/margins": 2.1831793785095215, - "rewards/rejected": -1.483900547027588, + "logits/chosen": -2.9849166870117188, + "logits/rejected": -2.9961228370666504, + "logps/chosen": -282.36614990234375, + "logps/rejected": -262.4176330566406, + "loss": 0.3005, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 0.5268134474754333, + "rewards/margins": 2.4909260272979736, + "rewards/rejected": -1.9641125202178955, "step": 330 }, { "epoch": 0.18, "learning_rate": 2.9209621993127146e-07, - "logits/chosen": -2.8814268112182617, - "logits/rejected": -2.9174575805664062, - "logps/chosen": -279.43878173828125, - "logps/rejected": -296.05908203125, - "loss": 0.435, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": 0.5694992542266846, - "rewards/margins": 1.738724708557129, - "rewards/rejected": -1.1692254543304443, + "logits/chosen": -2.890103816986084, + "logits/rejected": -2.9303138256073, + "logps/chosen": -281.1314697265625, + "logps/rejected": -257.63189697265625, + "loss": 0.365, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 0.40147724747657776, + "rewards/margins": 2.1507370471954346, + "rewards/rejected": -1.7492597103118896, "step": 340 }, { "epoch": 0.18, "learning_rate": 3.006872852233677e-07, - "logits/chosen": -2.994208335876465, - "logits/rejected": -2.974083662033081, - "logps/chosen": -204.75537109375, - "logps/rejected": -272.02471923828125, - "loss": 0.3807, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": 0.33386245369911194, - "rewards/margins": 1.8207858800888062, - "rewards/rejected": -1.4869236946105957, + "logits/chosen": -2.9978394508361816, + "logits/rejected": -2.974975347518921, + "logps/chosen": -206.7707977294922, + "logps/rejected": -221.951416015625, + "loss": 0.3081, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 0.12175428867340088, + "rewards/margins": 2.2502267360687256, + "rewards/rejected": -2.128472328186035, "step": 350 }, { "epoch": 0.19, "learning_rate": 3.0927835051546394e-07, - "logits/chosen": -2.9840168952941895, - "logits/rejected": -3.0060067176818848, - "logps/chosen": -230.21176147460938, - "logps/rejected": -264.5437316894531, - "loss": 0.4356, - "rewards/accuracies": 0.8125, - "rewards/chosen": 0.22637569904327393, - "rewards/margins": 1.9646222591400146, - "rewards/rejected": -1.7382465600967407, + "logits/chosen": -2.9994003772735596, + "logits/rejected": -3.0118205547332764, + "logps/chosen": -239.62411499023438, + "logps/rejected": -210.3192901611328, + "loss": 0.3334, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 0.1213647872209549, + "rewards/margins": 2.623220920562744, + "rewards/rejected": -2.5018563270568848, "step": 360 }, { "epoch": 0.19, "learning_rate": 3.178694158075601e-07, - "logits/chosen": -2.993978500366211, - "logits/rejected": -2.9822306632995605, - "logps/chosen": -232.2978057861328, - "logps/rejected": -236.8600616455078, - "loss": 0.4206, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": 0.44667625427246094, - "rewards/margins": 2.0350661277770996, - "rewards/rejected": -1.5883899927139282, + "logits/chosen": -3.008749008178711, + "logits/rejected": -2.999889612197876, + "logps/chosen": -237.5744171142578, + "logps/rejected": -204.9453125, + "loss": 0.336, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 0.27118250727653503, + "rewards/margins": 3.0523250102996826, + "rewards/rejected": -2.7811427116394043, "step": 370 }, { "epoch": 0.2, "learning_rate": 3.2646048109965636e-07, - "logits/chosen": -2.9275434017181396, - "logits/rejected": -2.952369213104248, - "logps/chosen": -182.41514587402344, - "logps/rejected": -267.49188232421875, - "loss": 0.5245, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.6680549383163452, - "rewards/margins": 2.175363302230835, - "rewards/rejected": -1.5073082447052002, + "logits/chosen": -2.9313995838165283, + "logits/rejected": -2.9716029167175293, + "logps/chosen": -184.48663330078125, + "logps/rejected": -233.38485717773438, + "loss": 0.4675, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.46090951561927795, + "rewards/margins": 2.5504894256591797, + "rewards/rejected": -2.0895798206329346, "step": 380 }, { "epoch": 0.2, "learning_rate": 3.3505154639175255e-07, - "logits/chosen": -3.0049033164978027, - "logits/rejected": -3.054906129837036, - "logps/chosen": -211.040771484375, - "logps/rejected": -257.2097473144531, - "loss": 0.4087, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.5466452836990356, - "rewards/margins": 2.268761157989502, - "rewards/rejected": -1.7221157550811768, + "logits/chosen": -2.9859910011291504, + "logits/rejected": -3.0315299034118652, + "logps/chosen": -217.3363800048828, + "logps/rejected": -205.7701416015625, + "loss": 0.3563, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.40869975090026855, + "rewards/margins": 3.200533390045166, + "rewards/rejected": -2.7918336391448975, "step": 390 }, { "epoch": 0.21, "learning_rate": 3.436426116838488e-07, - "logits/chosen": -3.062668561935425, - "logits/rejected": -3.073979616165161, - "logps/chosen": -220.09646606445312, - "logps/rejected": -228.078125, - "loss": 0.5348, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.29860109090805054, - "rewards/margins": 1.6728731393814087, - "rewards/rejected": -1.374272108078003, + "logits/chosen": -3.031710147857666, + "logits/rejected": -3.041163921356201, + "logps/chosen": -222.6304168701172, + "logps/rejected": -204.8536834716797, + "loss": 0.3508, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.04648579657077789, + "rewards/margins": 2.636461019515991, + "rewards/rejected": -2.589975118637085, "step": 400 }, { "epoch": 0.21, - "eval_logits/chosen": -2.956052780151367, - "eval_logits/rejected": -3.00534725189209, - "eval_logps/chosen": -244.32916259765625, - "eval_logps/rejected": -293.5365295410156, - "eval_loss": 0.47088953852653503, - "eval_rewards/accuracies": 0.8040000200271606, - "eval_rewards/chosen": 0.3350996673107147, - "eval_rewards/margins": 2.075010299682617, - "eval_rewards/rejected": -1.7399104833602905, - "eval_runtime": 278.6522, - "eval_samples_per_second": 7.177, - "eval_steps_per_second": 0.449, + "eval_logits/chosen": -2.942204236984253, + "eval_logits/rejected": -2.9765143394470215, + "eval_logps/chosen": -245.168701171875, + "eval_logps/rejected": -251.80572509765625, + "eval_loss": 0.3957485854625702, + "eval_rewards/accuracies": 0.843999981880188, + "eval_rewards/chosen": 0.2645789086818695, + "eval_rewards/margins": 2.8717663288116455, + "eval_rewards/rejected": -2.607187509536743, + "eval_runtime": 276.0326, + "eval_samples_per_second": 7.246, + "eval_steps_per_second": 0.453, "step": 400 }, { "epoch": 0.21, "learning_rate": 3.5223367697594503e-07, - "logits/chosen": -2.9030888080596924, - "logits/rejected": -2.903963804244995, - "logps/chosen": -218.80093383789062, - "logps/rejected": -270.7477111816406, - "loss": 0.3946, + "logits/chosen": -2.8807833194732666, + "logits/rejected": -2.880128860473633, + "logps/chosen": -220.51565551757812, + "logps/rejected": -253.80551147460938, + "loss": 0.3045, "rewards/accuracies": 0.824999988079071, - "rewards/chosen": 0.5591822862625122, - "rewards/margins": 2.3011364936828613, - "rewards/rejected": -1.7419544458389282, + "rewards/chosen": 0.387712687253952, + "rewards/margins": 2.6848089694976807, + "rewards/rejected": -2.2970964908599854, "step": 410 }, { "epoch": 0.22, "learning_rate": 3.608247422680412e-07, - "logits/chosen": -2.9065468311309814, - "logits/rejected": -2.909379482269287, - "logps/chosen": -281.303466796875, - "logps/rejected": -283.3905029296875, - "loss": 0.4069, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": 0.30885323882102966, - "rewards/margins": 2.3154749870300293, - "rewards/rejected": -2.006621837615967, + "logits/chosen": -2.876032829284668, + "logits/rejected": -2.8481011390686035, + "logps/chosen": -284.7151184082031, + "logps/rejected": -234.76937866210938, + "loss": 0.2913, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.24851512908935547, + "rewards/margins": 3.5443179607391357, + "rewards/rejected": -3.295802593231201, "step": 420 }, { "epoch": 0.22, "learning_rate": 3.6941580756013745e-07, - "logits/chosen": -3.0179543495178223, - "logits/rejected": -2.9964187145233154, - "logps/chosen": -246.273193359375, - "logps/rejected": -244.64932250976562, - "loss": 0.4263, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.045947521924972534, - "rewards/margins": 1.8498051166534424, - "rewards/rejected": -1.8957529067993164, + "logits/chosen": -2.9894893169403076, + "logits/rejected": -2.9477481842041016, + "logps/chosen": -244.4584503173828, + "logps/rejected": -198.04327392578125, + "loss": 0.336, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.13636508584022522, + "rewards/margins": 3.013824462890625, + "rewards/rejected": -2.8774592876434326, "step": 430 }, { "epoch": 0.23, "learning_rate": 3.7800687285223364e-07, - "logits/chosen": -2.9630212783813477, - "logits/rejected": -3.0045969486236572, - "logps/chosen": -257.0661926269531, - "logps/rejected": -351.8866271972656, - "loss": 0.4483, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": 0.05867958813905716, - "rewards/margins": 1.8917491436004639, - "rewards/rejected": -1.8330695629119873, + "logits/chosen": -2.926450490951538, + "logits/rejected": -2.977935552597046, + "logps/chosen": -257.2900390625, + "logps/rejected": -301.8376770019531, + "loss": 0.3746, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.036764394491910934, + "rewards/margins": 2.0472729206085205, + "rewards/rejected": -2.0105087757110596, "step": 440 }, { "epoch": 0.23, "learning_rate": 3.865979381443299e-07, - "logits/chosen": -3.0662550926208496, - "logits/rejected": -3.0640132427215576, - "logps/chosen": -228.62112426757812, - "logps/rejected": -314.25103759765625, - "loss": 0.4252, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": 0.3938661217689514, - "rewards/margins": 1.8685601949691772, - "rewards/rejected": -1.474694013595581, + "logits/chosen": -3.0555129051208496, + "logits/rejected": -3.069122791290283, + "logps/chosen": -227.95889282226562, + "logps/rejected": -293.7567443847656, + "loss": 0.3255, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 0.460088312625885, + "rewards/margins": 2.640362501144409, + "rewards/rejected": -2.180274486541748, "step": 450 }, { "epoch": 0.24, "learning_rate": 3.9518900343642607e-07, - "logits/chosen": -3.0231873989105225, - "logits/rejected": -3.001800775527954, - "logps/chosen": -251.1888885498047, - "logps/rejected": -312.30120849609375, - "loss": 0.5593, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": 0.4971710741519928, - "rewards/margins": 2.303713321685791, - "rewards/rejected": -1.8065423965454102, + "logits/chosen": -3.0144412517547607, + "logits/rejected": -2.9838833808898926, + "logps/chosen": -252.33584594726562, + "logps/rejected": -271.2647705078125, + "loss": 0.3863, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.3810385763645172, + "rewards/margins": 3.133136034011841, + "rewards/rejected": -2.7520978450775146, "step": 460 }, { "epoch": 0.24, "learning_rate": 4.037800687285223e-07, - "logits/chosen": -3.061251401901245, - "logits/rejected": -3.065091848373413, - "logps/chosen": -299.6525573730469, - "logps/rejected": -242.2359619140625, - "loss": 0.4287, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": 0.43876272439956665, - "rewards/margins": 2.200730085372925, - "rewards/rejected": -1.761967420578003, + "logits/chosen": -3.059541940689087, + "logits/rejected": -3.0535807609558105, + "logps/chosen": -301.24578857421875, + "logps/rejected": -194.94271850585938, + "loss": 0.3063, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 0.1833353340625763, + "rewards/margins": 3.5823638439178467, + "rewards/rejected": -3.3990283012390137, "step": 470 }, { "epoch": 0.25, "learning_rate": 4.123711340206185e-07, - "logits/chosen": -3.0854439735412598, - "logits/rejected": -3.067539691925049, - "logps/chosen": -296.04217529296875, - "logps/rejected": -291.0183410644531, - "loss": 0.388, - "rewards/accuracies": 0.8125, - "rewards/chosen": 0.09810693562030792, - "rewards/margins": 1.5580971240997314, - "rewards/rejected": -1.4599902629852295, + "logits/chosen": -3.071159839630127, + "logits/rejected": -3.0471129417419434, + "logps/chosen": -298.99078369140625, + "logps/rejected": -258.37957763671875, + "loss": 0.3268, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1941816359758377, + "rewards/margins": 2.6484832763671875, + "rewards/rejected": -2.842665195465088, "step": 480 }, { "epoch": 0.25, "learning_rate": 4.209621993127148e-07, - "logits/chosen": -2.9323372840881348, - "logits/rejected": -2.976844310760498, - "logps/chosen": -282.63629150390625, - "logps/rejected": -290.7026062011719, - "loss": 0.4349, + "logits/chosen": -2.898244619369507, + "logits/rejected": -2.9433770179748535, + "logps/chosen": -285.5610046386719, + "logps/rejected": -246.0979461669922, + "loss": 0.3612, "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": 0.22253699600696564, - "rewards/margins": 1.85140061378479, - "rewards/rejected": -1.6288635730743408, + "rewards/chosen": 0.12167198956012726, + "rewards/margins": 2.858144760131836, + "rewards/rejected": -2.7364726066589355, "step": 490 }, { "epoch": 0.26, "learning_rate": 4.2955326460481097e-07, - "logits/chosen": -3.1376662254333496, - "logits/rejected": -3.1647543907165527, - "logps/chosen": -230.1941375732422, - "logps/rejected": -306.46929931640625, - "loss": 0.4742, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": 0.3802974820137024, - "rewards/margins": 2.1001405715942383, - "rewards/rejected": -1.7198429107666016, + "logits/chosen": -3.1096904277801514, + "logits/rejected": -3.119216203689575, + "logps/chosen": -231.02804565429688, + "logps/rejected": -261.2414855957031, + "loss": 0.3785, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2965603172779083, + "rewards/margins": 3.2850193977355957, + "rewards/rejected": -2.9884591102600098, "step": 500 }, { "epoch": 0.26, - "eval_logits/chosen": -3.0499820709228516, - "eval_logits/rejected": -3.1011428833007812, - "eval_logps/chosen": -243.72792053222656, - "eval_logps/rejected": -294.0814208984375, - "eval_loss": 0.5065268874168396, - "eval_rewards/accuracies": 0.8220000267028809, - "eval_rewards/chosen": 0.39522290229797363, - "eval_rewards/margins": 2.1896207332611084, - "eval_rewards/rejected": -1.7943980693817139, - "eval_runtime": 278.4037, - "eval_samples_per_second": 7.184, - "eval_steps_per_second": 0.449, + "eval_logits/chosen": -3.0375795364379883, + "eval_logits/rejected": -3.0745201110839844, + "eval_logps/chosen": -243.3692169189453, + "eval_logps/rejected": -252.08010864257812, + "eval_loss": 0.3716643750667572, + "eval_rewards/accuracies": 0.8420000076293945, + "eval_rewards/chosen": 0.4445287585258484, + "eval_rewards/margins": 3.0791571140289307, + "eval_rewards/rejected": -2.6346285343170166, + "eval_runtime": 275.9498, + "eval_samples_per_second": 7.248, + "eval_steps_per_second": 0.453, "step": 500 }, { "epoch": 0.26, "learning_rate": 4.381443298969072e-07, - "logits/chosen": -3.00343918800354, - "logits/rejected": -3.095369815826416, - "logps/chosen": -258.6968994140625, - "logps/rejected": -292.36102294921875, - "loss": 0.573, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": 0.032131295651197433, - "rewards/margins": 1.5518535375595093, - "rewards/rejected": -1.5197222232818604, + "logits/chosen": -3.0041661262512207, + "logits/rejected": -3.066612720489502, + "logps/chosen": -260.65753173828125, + "logps/rejected": -258.9786071777344, + "loss": 0.4774, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.16205310821533203, + "rewards/margins": 2.2974131107330322, + "rewards/rejected": -2.459465980529785, "step": 510 }, { "epoch": 0.27, "learning_rate": 4.4673539518900345e-07, - "logits/chosen": -3.037055730819702, - "logits/rejected": -3.0623817443847656, - "logps/chosen": -223.7424774169922, - "logps/rejected": -250.5542449951172, - "loss": 0.5577, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -0.20696468651294708, - "rewards/margins": 1.9794868230819702, - "rewards/rejected": -2.1864516735076904, + "logits/chosen": -3.048417806625366, + "logits/rejected": -3.0493385791778564, + "logps/chosen": -222.28921508789062, + "logps/rejected": -202.9162139892578, + "loss": 0.3127, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.0617719367146492, + "rewards/margins": 4.143820762634277, + "rewards/rejected": -4.205592632293701, "step": 520 }, { "epoch": 0.27, "learning_rate": 4.5532646048109964e-07, - "logits/chosen": -3.029510021209717, - "logits/rejected": -3.048191547393799, - "logps/chosen": -239.50729370117188, - "logps/rejected": -269.36920166015625, - "loss": 0.4672, - "rewards/accuracies": 0.8125, - "rewards/chosen": 0.13861794769763947, - "rewards/margins": 2.154052257537842, - "rewards/rejected": -2.0154342651367188, + "logits/chosen": -3.016369104385376, + "logits/rejected": -3.033298969268799, + "logps/chosen": -241.38107299804688, + "logps/rejected": -238.4398956298828, + "loss": 0.3222, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.04746321588754654, + "rewards/margins": 3.2291030883789062, + "rewards/rejected": -3.2765660285949707, "step": 530 }, { "epoch": 0.28, "learning_rate": 4.639175257731959e-07, - "logits/chosen": -3.0581881999969482, - "logits/rejected": -3.0828652381896973, - "logps/chosen": -284.1593933105469, - "logps/rejected": -302.8993225097656, - "loss": 0.5169, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": 0.19171719253063202, - "rewards/margins": 1.9804853200912476, - "rewards/rejected": -1.7887680530548096, + "logits/chosen": -3.0321764945983887, + "logits/rejected": -3.0770981311798096, + "logps/chosen": -287.4913330078125, + "logps/rejected": -230.2133331298828, + "loss": 0.2848, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.141477569937706, + "rewards/margins": 3.4585938453674316, + "rewards/rejected": -3.600071430206299, "step": 540 }, { "epoch": 0.28, "learning_rate": 4.7250859106529206e-07, - "logits/chosen": -3.065326452255249, - "logits/rejected": -3.0628104209899902, - "logps/chosen": -265.96173095703125, - "logps/rejected": -304.02923583984375, - "loss": 0.5727, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": 0.19634675979614258, - "rewards/margins": 2.1012516021728516, - "rewards/rejected": -1.9049047231674194, + "logits/chosen": -3.048055410385132, + "logits/rejected": -3.0591869354248047, + "logps/chosen": -268.286376953125, + "logps/rejected": -281.8370361328125, + "loss": 0.4775, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.03612400218844414, + "rewards/margins": 3.5109527111053467, + "rewards/rejected": -3.54707670211792, "step": 550 }, { "epoch": 0.29, "learning_rate": 4.810996563573884e-07, - "logits/chosen": -3.045553684234619, - "logits/rejected": -3.0148532390594482, - "logps/chosen": -308.58282470703125, - "logps/rejected": -330.0618896484375, - "loss": 0.4386, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.14334824681282043, - "rewards/margins": 2.13740873336792, - "rewards/rejected": -1.994060754776001, + "logits/chosen": -2.9997217655181885, + "logits/rejected": -2.946760416030884, + "logps/chosen": -308.07403564453125, + "logps/rejected": -301.5303039550781, + "loss": 0.3241, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.19681063294410706, + "rewards/margins": 3.283550262451172, + "rewards/rejected": -3.086740016937256, "step": 560 }, { "epoch": 0.29, "learning_rate": 4.896907216494845e-07, - "logits/chosen": -3.091966152191162, - "logits/rejected": -3.092928886413574, - "logps/chosen": -207.5926513671875, - "logps/rejected": -312.8612365722656, - "loss": 0.4463, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": 0.2711809277534485, - "rewards/margins": 2.29288911819458, - "rewards/rejected": -2.0217084884643555, + "logits/chosen": -3.0523531436920166, + "logits/rejected": -3.0602190494537354, + "logps/chosen": -204.9850311279297, + "logps/rejected": -271.73443603515625, + "loss": 0.3887, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.5319434404373169, + "rewards/margins": 3.667424440383911, + "rewards/rejected": -3.1354808807373047, "step": 570 }, { "epoch": 0.3, "learning_rate": 4.982817869415807e-07, - "logits/chosen": -3.1156067848205566, - "logits/rejected": -3.081942081451416, - "logps/chosen": -261.8205871582031, - "logps/rejected": -246.6261444091797, - "loss": 0.5618, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": 0.352669894695282, - "rewards/margins": 2.471661329269409, - "rewards/rejected": -2.1189913749694824, + "logits/chosen": -3.068105936050415, + "logits/rejected": -3.0021653175354004, + "logps/chosen": -258.40533447265625, + "logps/rejected": -200.6361541748047, + "loss": 0.3927, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 0.694198489189148, + "rewards/margins": 5.009927749633789, + "rewards/rejected": -4.31572961807251, "step": 580 }, { "epoch": 0.3, "learning_rate": 4.992350353796136e-07, - "logits/chosen": -3.011913776397705, - "logits/rejected": -3.0575170516967773, - "logps/chosen": -221.0374298095703, - "logps/rejected": -290.2525634765625, - "loss": 0.4376, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": 0.34248480200767517, - "rewards/margins": 2.1471829414367676, - "rewards/rejected": -1.8046982288360596, + "logits/chosen": -2.987105131149292, + "logits/rejected": -3.039296865463257, + "logps/chosen": -226.6549072265625, + "logps/rejected": -260.92022705078125, + "loss": 0.5268, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.2192639857530594, + "rewards/margins": 2.660447120666504, + "rewards/rejected": -2.879711151123047, "step": 590 }, { "epoch": 0.31, "learning_rate": 4.982788296041308e-07, - "logits/chosen": -3.086638927459717, - "logits/rejected": -3.0424036979675293, - "logps/chosen": -224.4527587890625, - "logps/rejected": -277.10101318359375, - "loss": 0.6062, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -0.10577313601970673, - "rewards/margins": 2.2690727710723877, - "rewards/rejected": -2.3748459815979004, + "logits/chosen": -3.0547099113464355, + "logits/rejected": -2.9891700744628906, + "logps/chosen": -222.3723907470703, + "logps/rejected": -231.1334991455078, + "loss": 0.4096, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 0.08543944358825684, + "rewards/margins": 3.1246132850646973, + "rewards/rejected": -3.0391738414764404, "step": 600 }, { "epoch": 0.31, - "eval_logits/chosen": -2.973641872406006, - "eval_logits/rejected": -3.0394279956817627, - "eval_logps/chosen": -243.62783813476562, - "eval_logps/rejected": -295.17205810546875, - "eval_loss": 0.45028701424598694, - "eval_rewards/accuracies": 0.7979999780654907, - "eval_rewards/chosen": 0.40523144602775574, - "eval_rewards/margins": 2.3086962699890137, - "eval_rewards/rejected": -1.903464913368225, - "eval_runtime": 278.8163, - "eval_samples_per_second": 7.173, - "eval_steps_per_second": 0.448, + "eval_logits/chosen": -2.9718706607818604, + "eval_logits/rejected": -3.0164618492126465, + "eval_logps/chosen": -242.2559051513672, + "eval_logps/rejected": -252.58877563476562, + "eval_loss": 0.36097678542137146, + "eval_rewards/accuracies": 0.8379999995231628, + "eval_rewards/chosen": 0.555860698223114, + "eval_rewards/margins": 3.241353750228882, + "eval_rewards/rejected": -2.685493230819702, + "eval_runtime": 276.2763, + "eval_samples_per_second": 7.239, + "eval_steps_per_second": 0.452, "step": 600 }, { "epoch": 0.31, "learning_rate": 4.973226238286479e-07, - "logits/chosen": -3.030365228652954, - "logits/rejected": -3.0040786266326904, - "logps/chosen": -311.5085754394531, - "logps/rejected": -328.2323913574219, - "loss": 0.4114, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": 0.8323566317558289, - "rewards/margins": 2.6754965782165527, - "rewards/rejected": -1.8431400060653687, + "logits/chosen": -3.002805471420288, + "logits/rejected": -3.007511615753174, + "logps/chosen": -307.4808349609375, + "logps/rejected": -306.5906677246094, + "loss": 0.3304, + "rewards/accuracies": 0.875, + "rewards/chosen": 1.2351317405700684, + "rewards/margins": 3.570100784301758, + "rewards/rejected": -2.3349690437316895, "step": 610 }, { "epoch": 0.32, "learning_rate": 4.96366418053165e-07, - "logits/chosen": -3.1455302238464355, - "logits/rejected": -3.1191396713256836, - "logps/chosen": -239.08175659179688, - "logps/rejected": -321.47772216796875, - "loss": 0.4984, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": 0.34067609906196594, - "rewards/margins": 1.8430697917938232, - "rewards/rejected": -1.5023938417434692, + "logits/chosen": -3.127328395843506, + "logits/rejected": -3.0897140502929688, + "logps/chosen": -239.10116577148438, + "logps/rejected": -259.9046936035156, + "loss": 0.4016, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 0.3386600613594055, + "rewards/margins": 3.0577590465545654, + "rewards/rejected": -2.719099283218384, "step": 620 }, { "epoch": 0.33, "learning_rate": 4.954102122776821e-07, - "logits/chosen": -3.211892604827881, - "logits/rejected": -3.096082925796509, - "logps/chosen": -221.83352661132812, - "logps/rejected": -225.68270874023438, - "loss": 0.3615, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": 0.5400357842445374, - "rewards/margins": 2.6193814277648926, - "rewards/rejected": -2.079345464706421, + "logits/chosen": -3.131995677947998, + "logits/rejected": -3.1096434593200684, + "logps/chosen": -222.3499755859375, + "logps/rejected": -199.81442260742188, + "loss": 0.3026, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 0.488389790058136, + "rewards/margins": 3.672825574874878, + "rewards/rejected": -3.184436321258545, "step": 630 }, { "epoch": 0.33, "learning_rate": 4.944540065021993e-07, - "logits/chosen": -2.9544904232025146, - "logits/rejected": -3.0490562915802, - "logps/chosen": -211.2279510498047, - "logps/rejected": -266.6455078125, - "loss": 0.5134, - "rewards/accuracies": 0.8125, - "rewards/chosen": 0.1349196881055832, - "rewards/margins": 2.3002161979675293, - "rewards/rejected": -2.1652963161468506, + "logits/chosen": -2.9389474391937256, + "logits/rejected": -2.9960570335388184, + "logps/chosen": -217.94302368164062, + "logps/rejected": -229.5566864013672, + "loss": 0.4824, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 0.15888763964176178, + "rewards/margins": 2.7971763610839844, + "rewards/rejected": -2.6382882595062256, "step": 640 }, { "epoch": 0.34, "learning_rate": 4.934978007267163e-07, - "logits/chosen": -3.1061511039733887, - "logits/rejected": -3.083832263946533, - "logps/chosen": -229.7516632080078, - "logps/rejected": -296.2362060546875, - "loss": 0.4771, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": 0.3486045002937317, - "rewards/margins": 2.3736672401428223, - "rewards/rejected": -2.0250627994537354, + "logits/chosen": -3.0887157917022705, + "logits/rejected": -3.0468525886535645, + "logps/chosen": -229.8500518798828, + "logps/rejected": -230.75021362304688, + "loss": 0.3801, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.337777704000473, + "rewards/margins": 3.744509220123291, + "rewards/rejected": -3.406731367111206, "step": 650 }, { "epoch": 0.34, "learning_rate": 4.925415949512335e-07, - "logits/chosen": -3.0613837242126465, - "logits/rejected": -3.051008701324463, - "logps/chosen": -301.8056640625, - "logps/rejected": -314.62066650390625, - "loss": 0.4375, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": 0.7906830310821533, - "rewards/margins": 3.2094345092773438, - "rewards/rejected": -2.4187512397766113, + "logits/chosen": -3.043938636779785, + "logits/rejected": -3.0380680561065674, + "logps/chosen": -305.6777038574219, + "logps/rejected": -273.5068359375, + "loss": 0.4224, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.4034802317619324, + "rewards/margins": 3.009141445159912, + "rewards/rejected": -2.605661153793335, "step": 660 }, { "epoch": 0.35, "learning_rate": 4.915853891757506e-07, - "logits/chosen": -3.042600154876709, - "logits/rejected": -3.056886672973633, - "logps/chosen": -180.24853515625, - "logps/rejected": -310.36590576171875, - "loss": 0.5875, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.13172228634357452, - "rewards/margins": 2.5716919898986816, - "rewards/rejected": -2.70341420173645, + "logits/chosen": -2.9996867179870605, + "logits/rejected": -3.0021872520446777, + "logps/chosen": -177.00247192382812, + "logps/rejected": -259.97943115234375, + "loss": 0.4001, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.1928858757019043, + "rewards/margins": 3.253997802734375, + "rewards/rejected": -3.0611119270324707, "step": 670 }, { "epoch": 0.35, "learning_rate": 4.906291834002677e-07, - "logits/chosen": -3.0984647274017334, - "logits/rejected": -3.095193386077881, - "logps/chosen": -238.5060577392578, - "logps/rejected": -300.75897216796875, - "loss": 0.4938, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": 0.11066919565200806, - "rewards/margins": 2.522183656692505, - "rewards/rejected": -2.4115145206451416, + "logits/chosen": -3.018584728240967, + "logits/rejected": -3.014846086502075, + "logps/chosen": -236.12661743164062, + "logps/rejected": -238.1277618408203, + "loss": 0.3585, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.347831666469574, + "rewards/margins": 4.087057590484619, + "rewards/rejected": -3.7392261028289795, "step": 680 }, { "epoch": 0.36, "learning_rate": 4.896729776247848e-07, - "logits/chosen": -3.10922908782959, - "logits/rejected": -3.1270406246185303, - "logps/chosen": -289.7776184082031, - "logps/rejected": -288.74310302734375, - "loss": 0.5012, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": 0.14315786957740784, - "rewards/margins": 2.4653332233428955, - "rewards/rejected": -2.3221755027770996, + "logits/chosen": -3.0931556224823, + "logits/rejected": -3.0879898071289062, + "logps/chosen": -286.33917236328125, + "logps/rejected": -240.1806640625, + "loss": 0.3696, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.48345947265625, + "rewards/margins": 3.4146294593811035, + "rewards/rejected": -2.9311702251434326, "step": 690 }, { "epoch": 0.36, "learning_rate": 4.88716771849302e-07, - "logits/chosen": -3.106825351715088, - "logits/rejected": -3.0666584968566895, - "logps/chosen": -318.9488525390625, - "logps/rejected": -314.478759765625, - "loss": 0.4228, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": 0.45823708176612854, - "rewards/margins": 2.8160953521728516, - "rewards/rejected": -2.357858180999756, + "logits/chosen": -3.0839028358459473, + "logits/rejected": -3.054001569747925, + "logps/chosen": -313.2236022949219, + "logps/rejected": -278.3089294433594, + "loss": 0.3551, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 1.030763864517212, + "rewards/margins": 3.946103572845459, + "rewards/rejected": -2.915339708328247, "step": 700 }, { "epoch": 0.36, - "eval_logits/chosen": -2.9973301887512207, - "eval_logits/rejected": -3.065870523452759, - "eval_logps/chosen": -248.1629180908203, - "eval_logps/rejected": -302.49688720703125, - "eval_loss": 0.5025976896286011, - "eval_rewards/accuracies": 0.8199999928474426, - "eval_rewards/chosen": -0.04827720671892166, - "eval_rewards/margins": 2.5876684188842773, - "eval_rewards/rejected": -2.635946035385132, - "eval_runtime": 278.7128, - "eval_samples_per_second": 7.176, - "eval_steps_per_second": 0.448, + "eval_logits/chosen": -3.0171825885772705, + "eval_logits/rejected": -3.0461111068725586, + "eval_logps/chosen": -243.53004455566406, + "eval_logps/rejected": -256.5924072265625, + "eval_loss": 0.3507755398750305, + "eval_rewards/accuracies": 0.8519999980926514, + "eval_rewards/chosen": 0.42844560742378235, + "eval_rewards/margins": 3.51430344581604, + "eval_rewards/rejected": -3.08585786819458, + "eval_runtime": 275.8036, + "eval_samples_per_second": 7.252, + "eval_steps_per_second": 0.453, "step": 700 }, { "epoch": 0.37, "learning_rate": 4.87760566073819e-07, - "logits/chosen": -3.006333827972412, - "logits/rejected": -3.0047171115875244, - "logps/chosen": -304.8209533691406, - "logps/rejected": -291.0613708496094, - "loss": 0.4523, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.8323618173599243, - "rewards/margins": 2.86159610748291, - "rewards/rejected": -2.0292341709136963, + "logits/chosen": -2.9919934272766113, + "logits/rejected": -3.002340316772461, + "logps/chosen": -305.581298828125, + "logps/rejected": -261.66326904296875, + "loss": 0.3482, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 0.7561184167861938, + "rewards/margins": 3.418851375579834, + "rewards/rejected": -2.6627330780029297, "step": 710 }, { "epoch": 0.37, "learning_rate": 4.868043602983362e-07, - "logits/chosen": -3.0522098541259766, - "logits/rejected": -3.1009223461151123, - "logps/chosen": -246.0616455078125, - "logps/rejected": -343.6761169433594, - "loss": 0.4488, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.4445156157016754, - "rewards/margins": 3.416804552078247, - "rewards/rejected": -2.9722886085510254, + "logits/chosen": -3.083059787750244, + "logits/rejected": -3.0950496196746826, + "logps/chosen": -242.6635284423828, + "logps/rejected": -289.9736328125, + "loss": 0.3385, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.7847038507461548, + "rewards/margins": 3.952576160430908, + "rewards/rejected": -3.167872190475464, "step": 720 }, { "epoch": 0.38, "learning_rate": 4.858481545228533e-07, - "logits/chosen": -3.0612289905548096, - "logits/rejected": -3.070131301879883, - "logps/chosen": -283.8197021484375, - "logps/rejected": -327.98480224609375, - "loss": 0.4436, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": 0.8823187947273254, - "rewards/margins": 3.157557249069214, - "rewards/rejected": -2.2752389907836914, + "logits/chosen": -3.087794542312622, + "logits/rejected": -3.094449520111084, + "logps/chosen": -282.00323486328125, + "logps/rejected": -273.85736083984375, + "loss": 0.3407, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 1.0639668703079224, + "rewards/margins": 4.271727561950684, + "rewards/rejected": -3.2077605724334717, "step": 730 }, { "epoch": 0.38, "learning_rate": 4.848919487473704e-07, - "logits/chosen": -2.9994044303894043, - "logits/rejected": -3.0417821407318115, - "logps/chosen": -270.36944580078125, - "logps/rejected": -349.9217224121094, - "loss": 0.6635, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.09951668977737427, - "rewards/margins": 2.4409453868865967, - "rewards/rejected": -2.5404622554779053, + "logits/chosen": -3.071039915084839, + "logits/rejected": -3.1214590072631836, + "logps/chosen": -267.3724060058594, + "logps/rejected": -270.156982421875, + "loss": 0.3666, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.20018239319324493, + "rewards/margins": 3.6401877403259277, + "rewards/rejected": -3.44000506401062, "step": 740 }, { "epoch": 0.39, "learning_rate": 4.839357429718875e-07, - "logits/chosen": -3.0597434043884277, - "logits/rejected": -3.095123767852783, - "logps/chosen": -274.39984130859375, - "logps/rejected": -296.67333984375, - "loss": 0.5506, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": -0.694625973701477, - "rewards/margins": 1.9568843841552734, - "rewards/rejected": -2.651510238647461, + "logits/chosen": -3.1300594806671143, + "logits/rejected": -3.1247377395629883, + "logps/chosen": -274.65997314453125, + "logps/rejected": -251.75552368164062, + "loss": 0.3411, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.4149935841560364, + "rewards/margins": 3.0045580863952637, + "rewards/rejected": -3.419551372528076, "step": 750 }, { "epoch": 0.39, "learning_rate": 4.829795371964047e-07, - "logits/chosen": -3.0193636417388916, - "logits/rejected": -3.0218288898468018, - "logps/chosen": -250.9928436279297, - "logps/rejected": -305.6485900878906, - "loss": 0.4669, - "rewards/accuracies": 0.8125, - "rewards/chosen": 0.4517936110496521, - "rewards/margins": 2.732727527618408, - "rewards/rejected": -2.2809338569641113, + "logits/chosen": -3.0830349922180176, + "logits/rejected": -3.07245135307312, + "logps/chosen": -249.1414794921875, + "logps/rejected": -256.0914001464844, + "loss": 0.2612, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 0.6374005079269409, + "rewards/margins": 4.378551006317139, + "rewards/rejected": -3.741150379180908, "step": 760 }, { "epoch": 0.4, "learning_rate": 4.820233314209217e-07, - "logits/chosen": -2.859200954437256, - "logits/rejected": -2.8881382942199707, - "logps/chosen": -231.7881622314453, - "logps/rejected": -282.0718078613281, - "loss": 0.7131, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.0762915164232254, - "rewards/margins": 2.6063120365142822, - "rewards/rejected": -2.530020236968994, + "logits/chosen": -2.949800968170166, + "logits/rejected": -2.952327251434326, + "logps/chosen": -227.336181640625, + "logps/rejected": -227.2093048095703, + "loss": 0.4312, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 0.45587724447250366, + "rewards/margins": 3.9016730785369873, + "rewards/rejected": -3.4457955360412598, "step": 770 }, { "epoch": 0.4, "learning_rate": 4.810671256454389e-07, - "logits/chosen": -2.876128673553467, - "logits/rejected": -2.917257070541382, - "logps/chosen": -291.61676025390625, - "logps/rejected": -338.470703125, - "loss": 0.4415, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": 0.1621725857257843, - "rewards/margins": 2.5127065181732178, - "rewards/rejected": -2.350533962249756, + "logits/chosen": -2.9111714363098145, + "logits/rejected": -2.937349557876587, + "logps/chosen": -290.6889343261719, + "logps/rejected": -311.72991943359375, + "loss": 0.3317, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2621701657772064, + "rewards/margins": 4.011106014251709, + "rewards/rejected": -3.7489356994628906, "step": 780 }, { "epoch": 0.41, "learning_rate": 4.80110919869956e-07, - "logits/chosen": -2.9246349334716797, - "logits/rejected": -2.957263231277466, - "logps/chosen": -266.44879150390625, - "logps/rejected": -268.9879150390625, - "loss": 0.4671, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": 0.4171672761440277, - "rewards/margins": 2.7636542320251465, - "rewards/rejected": -2.346486806869507, + "logits/chosen": -2.9570553302764893, + "logits/rejected": -2.969442844390869, + "logps/chosen": -274.3780517578125, + "logps/rejected": -217.66665649414062, + "loss": 0.3759, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 0.034467197954654694, + "rewards/margins": 3.9245452880859375, + "rewards/rejected": -3.890077590942383, "step": 790 }, { "epoch": 0.41, "learning_rate": 4.791547140944731e-07, - "logits/chosen": -2.82590651512146, - "logits/rejected": -2.85862398147583, - "logps/chosen": -220.08151245117188, - "logps/rejected": -262.57232666015625, - "loss": 0.5396, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": 0.6729877591133118, - "rewards/margins": 2.703979253768921, - "rewards/rejected": -2.030991792678833, + "logits/chosen": -2.845878839492798, + "logits/rejected": -2.8583321571350098, + "logps/chosen": -223.0182647705078, + "logps/rejected": -220.53823852539062, + "loss": 0.3751, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.6028121709823608, + "rewards/margins": 4.598453998565674, + "rewards/rejected": -3.9956417083740234, "step": 800 }, { "epoch": 0.41, - "eval_logits/chosen": -2.855961561203003, - "eval_logits/rejected": -2.910541296005249, - "eval_logps/chosen": -242.5603485107422, - "eval_logps/rejected": -296.4591979980469, - "eval_loss": 0.4615330398082733, - "eval_rewards/accuracies": 0.8059999942779541, - "eval_rewards/chosen": 0.5119800567626953, - "eval_rewards/margins": 2.544154405593872, - "eval_rewards/rejected": -2.0321743488311768, - "eval_runtime": 279.1099, - "eval_samples_per_second": 7.166, - "eval_steps_per_second": 0.448, + "eval_logits/chosen": -2.8919763565063477, + "eval_logits/rejected": -2.917020559310913, + "eval_logps/chosen": -243.23812866210938, + "eval_logps/rejected": -259.6291809082031, + "eval_loss": 0.3683261573314667, + "eval_rewards/accuracies": 0.8479999899864197, + "eval_rewards/chosen": 0.4576367735862732, + "eval_rewards/margins": 3.847170829772949, + "eval_rewards/rejected": -3.3895342350006104, + "eval_runtime": 276.2939, + "eval_samples_per_second": 7.239, + "eval_steps_per_second": 0.452, "step": 800 }, { "epoch": 0.42, "learning_rate": 4.781985083189902e-07, - "logits/chosen": -2.9013190269470215, - "logits/rejected": -2.9036900997161865, - "logps/chosen": -205.20681762695312, - "logps/rejected": -307.65447998046875, - "loss": 0.5874, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": 0.207598015666008, - "rewards/margins": 2.063453197479248, - "rewards/rejected": -1.8558553457260132, + "logits/chosen": -2.93121600151062, + "logits/rejected": -2.9262402057647705, + "logps/chosen": -205.78897094726562, + "logps/rejected": -264.36810302734375, + "loss": 0.488, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.14966289699077606, + "rewards/margins": 3.21899676322937, + "rewards/rejected": -3.069333553314209, "step": 810 }, { "epoch": 0.42, "learning_rate": 4.772423025435074e-07, - "logits/chosen": -2.8757100105285645, - "logits/rejected": -2.8260178565979004, - "logps/chosen": -298.41424560546875, - "logps/rejected": -329.42718505859375, - "loss": 1.0314, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": 0.4544452130794525, - "rewards/margins": 1.7502931356430054, - "rewards/rejected": -1.2958478927612305, + "logits/chosen": -2.914243221282959, + "logits/rejected": -2.904094696044922, + "logps/chosen": -299.1681213378906, + "logps/rejected": -293.0981750488281, + "loss": 0.5188, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3790614604949951, + "rewards/margins": 2.6974353790283203, + "rewards/rejected": -2.318373918533325, "step": 820 }, { "epoch": 0.43, "learning_rate": 4.762860967680244e-07, - "logits/chosen": -2.871217727661133, - "logits/rejected": -2.9765567779541016, - "logps/chosen": -175.9529571533203, - "logps/rejected": -234.138427734375, - "loss": 0.7217, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": 0.6448418498039246, - "rewards/margins": 2.582141876220703, - "rewards/rejected": -1.9372999668121338, + "logits/chosen": -2.963219165802002, + "logits/rejected": -3.012232542037964, + "logps/chosen": -176.5527801513672, + "logps/rejected": -220.8793487548828, + "loss": 0.3004, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 0.5848600268363953, + "rewards/margins": 3.696707248687744, + "rewards/rejected": -3.111847162246704, "step": 830 }, { "epoch": 0.43, "learning_rate": 4.7532989099254154e-07, - "logits/chosen": -2.862799882888794, - "logits/rejected": -2.8322913646698, - "logps/chosen": -224.6620635986328, - "logps/rejected": -297.45562744140625, - "loss": 0.4842, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": 0.3813169598579407, - "rewards/margins": 2.5376696586608887, - "rewards/rejected": -2.1563527584075928, + "logits/chosen": -2.890906810760498, + "logits/rejected": -2.882544994354248, + "logps/chosen": -224.2097625732422, + "logps/rejected": -245.983154296875, + "loss": 0.3347, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.42617067694664, + "rewards/margins": 3.7636287212371826, + "rewards/rejected": -3.3374581336975098, "step": 840 }, { "epoch": 0.44, "learning_rate": 4.7437368521705866e-07, - "logits/chosen": -2.974093437194824, - "logits/rejected": -2.984872579574585, - "logps/chosen": -197.2064208984375, - "logps/rejected": -292.5155944824219, - "loss": 0.5179, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": 0.9365224838256836, - "rewards/margins": 3.0235087871551514, - "rewards/rejected": -2.0869860649108887, + "logits/chosen": -2.982151985168457, + "logits/rejected": -2.9996001720428467, + "logps/chosen": -196.6096649169922, + "logps/rejected": -246.6396484375, + "loss": 0.3384, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.9950971603393555, + "rewards/margins": 4.351628303527832, + "rewards/rejected": -3.3565316200256348, "step": 850 }, { "epoch": 0.44, "learning_rate": 4.7341747944157577e-07, - "logits/chosen": -2.951917886734009, - "logits/rejected": -2.975085735321045, - "logps/chosen": -277.63653564453125, - "logps/rejected": -305.8817138671875, - "loss": 0.4853, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": 0.252098023891449, - "rewards/margins": 2.036212205886841, - "rewards/rejected": -1.7841142416000366, + "logits/chosen": -2.9732279777526855, + "logits/rejected": -2.9481117725372314, + "logps/chosen": -276.61676025390625, + "logps/rejected": -244.1964569091797, + "loss": 0.4036, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 0.35451942682266235, + "rewards/margins": 3.730921506881714, + "rewards/rejected": -3.376401901245117, "step": 860 }, { "epoch": 0.45, "learning_rate": 4.724612736660929e-07, - "logits/chosen": -2.8899247646331787, - "logits/rejected": -2.867297887802124, - "logps/chosen": -254.412353515625, - "logps/rejected": -274.4820861816406, - "loss": 0.4448, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": 0.6464666128158569, - "rewards/margins": 2.7713568210601807, - "rewards/rejected": -2.124890089035034, + "logits/chosen": -2.905460834503174, + "logits/rejected": -2.919121503829956, + "logps/chosen": -253.0126495361328, + "logps/rejected": -250.7054901123047, + "loss": 0.3132, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 0.7869751453399658, + "rewards/margins": 4.713421821594238, + "rewards/rejected": -3.9264464378356934, "step": 870 }, { "epoch": 0.45, "learning_rate": 4.7150506789061006e-07, - "logits/chosen": -2.966210126876831, - "logits/rejected": -2.979029655456543, - "logps/chosen": -266.33978271484375, - "logps/rejected": -354.4034729003906, - "loss": 0.5281, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": 0.17260950803756714, - "rewards/margins": 2.235333204269409, - "rewards/rejected": -2.0627236366271973, + "logits/chosen": -2.969820499420166, + "logits/rejected": -2.969398021697998, + "logps/chosen": -268.5586853027344, + "logps/rejected": -330.3875732421875, + "loss": 0.5103, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.04928023740649223, + "rewards/margins": 3.3973872661590576, + "rewards/rejected": -3.446667432785034, "step": 880 }, { "epoch": 0.46, "learning_rate": 4.7054886211512717e-07, - "logits/chosen": -3.0034615993499756, - "logits/rejected": -3.0113863945007324, - "logps/chosen": -236.93765258789062, - "logps/rejected": -301.739013671875, - "loss": 0.4411, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": 0.5323739051818848, - "rewards/margins": 2.8146984577178955, - "rewards/rejected": -2.2823245525360107, + "logits/chosen": -2.9655165672302246, + "logits/rejected": -2.9372172355651855, + "logps/chosen": -237.802734375, + "logps/rejected": -231.11941528320312, + "loss": 0.3325, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.4452648162841797, + "rewards/margins": 4.296952247619629, + "rewards/rejected": -3.8516876697540283, "step": 890 }, { "epoch": 0.46, "learning_rate": 4.695926563396443e-07, - "logits/chosen": -3.032571315765381, - "logits/rejected": -3.023155927658081, - "logps/chosen": -244.9833221435547, - "logps/rejected": -273.37054443359375, - "loss": 0.5377, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": 0.09555510431528091, - "rewards/margins": 2.1211750507354736, - "rewards/rejected": -2.0256197452545166, + "logits/chosen": -3.002549648284912, + "logits/rejected": -3.001309871673584, + "logps/chosen": -245.85427856445312, + "logps/rejected": -246.02255249023438, + "loss": 0.4334, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.008461520075798035, + "rewards/margins": 3.1996092796325684, + "rewards/rejected": -3.191147804260254, "step": 900 }, { "epoch": 0.46, - "eval_logits/chosen": -2.904477834701538, - "eval_logits/rejected": -2.9651331901550293, - "eval_logps/chosen": -242.6551513671875, - "eval_logps/rejected": -295.7052001953125, - "eval_loss": 0.4912913739681244, - "eval_rewards/accuracies": 0.7960000038146973, - "eval_rewards/chosen": 0.5024977922439575, - "eval_rewards/margins": 2.4592745304107666, - "eval_rewards/rejected": -1.9567766189575195, - "eval_runtime": 278.6054, - "eval_samples_per_second": 7.179, - "eval_steps_per_second": 0.449, + "eval_logits/chosen": -2.896383762359619, + "eval_logits/rejected": -2.9298884868621826, + "eval_logps/chosen": -244.29901123046875, + "eval_logps/rejected": -261.3555603027344, + "eval_loss": 0.3622107207775116, + "eval_rewards/accuracies": 0.8420000076293945, + "eval_rewards/chosen": 0.3515479564666748, + "eval_rewards/margins": 3.9137206077575684, + "eval_rewards/rejected": -3.5621724128723145, + "eval_runtime": 276.3628, + "eval_samples_per_second": 7.237, + "eval_steps_per_second": 0.452, "step": 900 }, { "epoch": 0.47, "learning_rate": 4.686364505641614e-07, - "logits/chosen": -2.992729663848877, - "logits/rejected": -2.9883501529693604, - "logps/chosen": -262.54608154296875, - "logps/rejected": -270.30059814453125, - "loss": 0.4715, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": 0.4884259104728699, - "rewards/margins": 2.7323694229125977, - "rewards/rejected": -2.243943691253662, + "logits/chosen": -2.9575304985046387, + "logits/rejected": -2.977433443069458, + "logps/chosen": -263.6480407714844, + "logps/rejected": -227.0202178955078, + "loss": 0.3494, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3412472605705261, + "rewards/margins": 4.021925449371338, + "rewards/rejected": -3.680677890777588, "step": 910 }, { "epoch": 0.47, "learning_rate": 4.676802447886785e-07, - "logits/chosen": -2.9018282890319824, - "logits/rejected": -2.9027373790740967, - "logps/chosen": -248.11807250976562, - "logps/rejected": -285.3119201660156, - "loss": 0.5509, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.48879727721214294, - "rewards/margins": 2.8756463527679443, - "rewards/rejected": -2.3868489265441895, + "logits/chosen": -2.882467269897461, + "logits/rejected": -2.8895153999328613, + "logps/chosen": -248.9755096435547, + "logps/rejected": -272.48162841796875, + "loss": 0.388, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 0.403644859790802, + "rewards/margins": 4.087143898010254, + "rewards/rejected": -3.6834990978240967, "step": 920 }, { "epoch": 0.48, "learning_rate": 4.6672403901319564e-07, - "logits/chosen": -2.9280457496643066, - "logits/rejected": -2.954550266265869, - "logps/chosen": -214.4419403076172, - "logps/rejected": -278.14483642578125, - "loss": 0.3943, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": 0.816567063331604, - "rewards/margins": 2.9894230365753174, - "rewards/rejected": -2.1728556156158447, + "logits/chosen": -2.9266304969787598, + "logits/rejected": -2.957611322402954, + "logps/chosen": -216.09829711914062, + "logps/rejected": -250.50033569335938, + "loss": 0.306, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 0.6513862609863281, + "rewards/margins": 5.192070007324219, + "rewards/rejected": -4.540683746337891, "step": 930 }, { "epoch": 0.49, "learning_rate": 4.6576783323771275e-07, - "logits/chosen": -2.8166980743408203, - "logits/rejected": -2.7993063926696777, - "logps/chosen": -204.55262756347656, - "logps/rejected": -259.32012939453125, - "loss": 0.5375, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": 0.009540450759232044, - "rewards/margins": 2.455078601837158, - "rewards/rejected": -2.445538282394409, + "logits/chosen": -2.834805965423584, + "logits/rejected": -2.8517351150512695, + "logps/chosen": -205.66720581054688, + "logps/rejected": -231.88125610351562, + "loss": 0.2399, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 0.20726962387561798, + "rewards/margins": 4.4151930809021, + "rewards/rejected": -4.20792293548584, "step": 940 }, { "epoch": 0.49, "learning_rate": 4.6481162746222987e-07, - "logits/chosen": -2.8641529083251953, - "logits/rejected": -2.8614368438720703, - "logps/chosen": -309.6964111328125, - "logps/rejected": -332.9620666503906, - "loss": 0.4252, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": -0.04014641046524048, - "rewards/margins": 3.2200770378112793, - "rewards/rejected": -3.260223388671875, + "logits/chosen": -2.864827871322632, + "logits/rejected": -2.861743211746216, + "logps/chosen": -313.04339599609375, + "logps/rejected": -273.49432373046875, + "loss": 0.5432, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -0.1318279504776001, + "rewards/margins": 3.865988254547119, + "rewards/rejected": -3.9978160858154297, "step": 950 }, { "epoch": 0.5, "learning_rate": 4.63855421686747e-07, - "logits/chosen": -2.892148494720459, - "logits/rejected": -2.906142473220825, - "logps/chosen": -258.5798645019531, - "logps/rejected": -293.4261474609375, - "loss": 0.4546, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": 0.11958281695842743, - "rewards/margins": 2.9684510231018066, - "rewards/rejected": -2.8488681316375732, + "logits/chosen": -2.8962018489837646, + "logits/rejected": -2.9046082496643066, + "logps/chosen": -258.37567138671875, + "logps/rejected": -253.084228515625, + "loss": 0.3532, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.13810980319976807, + "rewards/margins": 4.1932501792907715, + "rewards/rejected": -4.055140495300293, "step": 960 }, { "epoch": 0.5, "learning_rate": 4.628992159112641e-07, - "logits/chosen": -2.8312416076660156, - "logits/rejected": -2.909719228744507, - "logps/chosen": -231.47561645507812, - "logps/rejected": -321.9608459472656, - "loss": 0.4045, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.29932117462158203, - "rewards/margins": 2.905813217163086, - "rewards/rejected": -2.606492042541504, + "logits/chosen": -2.8867530822753906, + "logits/rejected": -2.9197027683258057, + "logps/chosen": -230.91952514648438, + "logps/rejected": -266.8638610839844, + "loss": 0.3635, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 0.36682578921318054, + "rewards/margins": 4.398791313171387, + "rewards/rejected": -4.031965732574463, "step": 970 }, { "epoch": 0.51, "learning_rate": 4.6194301013578116e-07, - "logits/chosen": -2.8688275814056396, - "logits/rejected": -2.9191553592681885, - "logps/chosen": -308.28192138671875, - "logps/rejected": -300.1478576660156, - "loss": 0.4955, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.015064060688018799, - "rewards/margins": 2.7704272270202637, - "rewards/rejected": -2.7854912281036377, + "logits/chosen": -2.8974499702453613, + "logits/rejected": -2.910565137863159, + "logps/chosen": -306.74481201171875, + "logps/rejected": -269.63983154296875, + "loss": 0.3329, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 0.13864299654960632, + "rewards/margins": 4.484843730926514, + "rewards/rejected": -4.346201419830322, "step": 980 }, { "epoch": 0.51, "learning_rate": 4.609868043602983e-07, - "logits/chosen": -2.9763271808624268, - "logits/rejected": -2.945359706878662, - "logps/chosen": -205.1085968017578, - "logps/rejected": -291.16156005859375, - "loss": 0.3997, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": 0.6670485734939575, - "rewards/margins": 3.19486927986145, - "rewards/rejected": -2.5278208255767822, + "logits/chosen": -2.9434916973114014, + "logits/rejected": -2.9197819232940674, + "logps/chosen": -208.92599487304688, + "logps/rejected": -235.34622192382812, + "loss": 0.2971, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 0.28640270233154297, + "rewards/margins": 5.084868907928467, + "rewards/rejected": -4.798466205596924, "step": 990 }, { "epoch": 0.52, "learning_rate": 4.600305985848154e-07, - "logits/chosen": -2.8994736671447754, - "logits/rejected": -2.9172794818878174, - "logps/chosen": -231.45339965820312, - "logps/rejected": -294.75274658203125, - "loss": 0.4886, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": 0.15061122179031372, - "rewards/margins": 2.3647937774658203, - "rewards/rejected": -2.2141823768615723, + "logits/chosen": -2.89619779586792, + "logits/rejected": -2.8711445331573486, + "logps/chosen": -233.331787109375, + "logps/rejected": -267.5240173339844, + "loss": 0.4673, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.037229109555482864, + "rewards/margins": 3.746826171875, + "rewards/rejected": -3.784055233001709, "step": 1000 }, { "epoch": 0.52, - "eval_logits/chosen": -2.8935189247131348, - "eval_logits/rejected": -2.9735095500946045, - "eval_logps/chosen": -246.81280517578125, - "eval_logps/rejected": -304.04638671875, - "eval_loss": 0.44952473044395447, - "eval_rewards/accuracies": 0.8059999942779541, - "eval_rewards/chosen": 0.0867358073592186, - "eval_rewards/margins": 2.877634286880493, - "eval_rewards/rejected": -2.790898084640503, - "eval_runtime": 278.697, - "eval_samples_per_second": 7.176, - "eval_steps_per_second": 0.449, + "eval_logits/chosen": -2.8946895599365234, + "eval_logits/rejected": -2.9296886920928955, + "eval_logps/chosen": -244.138916015625, + "eval_logps/rejected": -265.858642578125, + "eval_loss": 0.3594502806663513, + "eval_rewards/accuracies": 0.871999979019165, + "eval_rewards/chosen": 0.36755701899528503, + "eval_rewards/margins": 4.380038738250732, + "eval_rewards/rejected": -4.012481689453125, + "eval_runtime": 276.1306, + "eval_samples_per_second": 7.243, + "eval_steps_per_second": 0.453, "step": 1000 }, { "epoch": 0.52, "learning_rate": 4.590743928093325e-07, - "logits/chosen": -2.7804977893829346, - "logits/rejected": -2.7935779094696045, - "logps/chosen": -289.16845703125, - "logps/rejected": -317.79296875, - "loss": 0.4715, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": 0.14265815913677216, - "rewards/margins": 2.7420856952667236, - "rewards/rejected": -2.5994274616241455, + "logits/chosen": -2.779526472091675, + "logits/rejected": -2.7811527252197266, + "logps/chosen": -292.5054016113281, + "logps/rejected": -267.2178649902344, + "loss": 0.4625, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.19104191660881042, + "rewards/margins": 3.901341676712036, + "rewards/rejected": -4.09238338470459, "step": 1010 }, { "epoch": 0.53, "learning_rate": 4.581181870338497e-07, - "logits/chosen": -2.856433391571045, - "logits/rejected": -2.861987590789795, - "logps/chosen": -326.0638732910156, - "logps/rejected": -347.38494873046875, - "loss": 0.375, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": 0.2731901705265045, - "rewards/margins": 3.395005702972412, - "rewards/rejected": -3.1218154430389404, + "logits/chosen": -2.822347640991211, + "logits/rejected": -2.8429737091064453, + "logps/chosen": -327.43121337890625, + "logps/rejected": -296.83953857421875, + "loss": 0.3805, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.13645482063293457, + "rewards/margins": 4.649985313415527, + "rewards/rejected": -4.513530254364014, "step": 1020 }, { "epoch": 0.53, "learning_rate": 4.571619812583668e-07, - "logits/chosen": -2.915353536605835, - "logits/rejected": -2.9367640018463135, - "logps/chosen": -259.37750244140625, - "logps/rejected": -325.2166442871094, - "loss": 0.4497, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.03804367035627365, - "rewards/margins": 2.7071585655212402, - "rewards/rejected": -2.6691150665283203, + "logits/chosen": -2.9043850898742676, + "logits/rejected": -2.88726806640625, + "logps/chosen": -271.67852783203125, + "logps/rejected": -256.3220520019531, + "loss": 0.5199, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.44141021370887756, + "rewards/margins": 4.01367712020874, + "rewards/rejected": -4.455087184906006, "step": 1030 }, { "epoch": 0.54, "learning_rate": 4.562057754828839e-07, - "logits/chosen": -2.8658149242401123, - "logits/rejected": -2.855698823928833, - "logps/chosen": -284.882568359375, - "logps/rejected": -290.2754821777344, - "loss": 0.5011, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.2647832930088043, - "rewards/margins": 3.2260468006134033, - "rewards/rejected": -2.9612631797790527, + "logits/chosen": -2.8271443843841553, + "logits/rejected": -2.8248634338378906, + "logps/chosen": -286.39593505859375, + "logps/rejected": -248.5173797607422, + "loss": 0.4351, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 0.11344115436077118, + "rewards/margins": 4.329672813415527, + "rewards/rejected": -4.216231346130371, "step": 1040 }, { "epoch": 0.54, "learning_rate": 4.55249569707401e-07, - "logits/chosen": -2.91452956199646, - "logits/rejected": -2.936554431915283, - "logps/chosen": -212.68734741210938, - "logps/rejected": -293.65185546875, - "loss": 0.4751, + "logits/chosen": -2.875030994415283, + "logits/rejected": -2.884498357772827, + "logps/chosen": -213.53286743164062, + "logps/rejected": -233.95266723632812, + "loss": 0.3342, "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.3139684796333313, - "rewards/margins": 2.481248617172241, - "rewards/rejected": -2.7952170372009277, + "rewards/chosen": -0.39852023124694824, + "rewards/margins": 3.867176055908203, + "rewards/rejected": -4.2656965255737305, "step": 1050 }, { "epoch": 0.55, "learning_rate": 4.5429336393191814e-07, - "logits/chosen": -2.83843994140625, - "logits/rejected": -2.899430751800537, - "logps/chosen": -221.84915161132812, - "logps/rejected": -261.29046630859375, - "loss": 0.6509, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": 0.24030618369579315, - "rewards/margins": 2.0365347862243652, - "rewards/rejected": -1.7962287664413452, + "logits/chosen": -2.8142218589782715, + "logits/rejected": -2.8583648204803467, + "logps/chosen": -224.10073852539062, + "logps/rejected": -242.09732055664062, + "loss": 0.5221, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.015148389153182507, + "rewards/margins": 3.7241263389587402, + "rewards/rejected": -3.7089781761169434, "step": 1060 }, { "epoch": 0.55, "learning_rate": 4.5333715815643525e-07, - "logits/chosen": -2.9258434772491455, - "logits/rejected": -2.9901933670043945, - "logps/chosen": -266.33856201171875, - "logps/rejected": -313.81488037109375, - "loss": 0.5163, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -0.2790577709674835, - "rewards/margins": 2.187150478363037, - "rewards/rejected": -2.4662084579467773, + "logits/chosen": -2.940594434738159, + "logits/rejected": -2.9206767082214355, + "logps/chosen": -266.69134521484375, + "logps/rejected": -268.947509765625, + "loss": 0.3638, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.3141533434391022, + "rewards/margins": 3.793126344680786, + "rewards/rejected": -4.1072797775268555, "step": 1070 }, { "epoch": 0.56, "learning_rate": 4.5238095238095237e-07, - "logits/chosen": -2.9200329780578613, - "logits/rejected": -2.912764310836792, - "logps/chosen": -227.3638916015625, - "logps/rejected": -279.1597595214844, - "loss": 0.4705, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": 0.6271113157272339, - "rewards/margins": 2.5784404277801514, - "rewards/rejected": -1.951329231262207, + "logits/chosen": -2.920224905014038, + "logits/rejected": -2.9056763648986816, + "logps/chosen": -229.7510986328125, + "logps/rejected": -259.50152587890625, + "loss": 0.3746, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.38838815689086914, + "rewards/margins": 3.28955340385437, + "rewards/rejected": -2.901165723800659, "step": 1080 }, { "epoch": 0.56, "learning_rate": 4.514247466054695e-07, - "logits/chosen": -2.837622880935669, - "logits/rejected": -2.9189155101776123, - "logps/chosen": -200.35733032226562, - "logps/rejected": -259.71942138671875, - "loss": 0.4103, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.44330382347106934, - "rewards/margins": 2.879500150680542, - "rewards/rejected": -2.4361963272094727, + "logits/chosen": -2.8937878608703613, + "logits/rejected": -2.933677911758423, + "logps/chosen": -198.47744750976562, + "logps/rejected": -233.0421905517578, + "loss": 0.3289, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.6210270524024963, + "rewards/margins": 4.2296671867370605, + "rewards/rejected": -3.60863995552063, "step": 1090 }, { "epoch": 0.57, "learning_rate": 4.504685408299866e-07, - "logits/chosen": -2.7896180152893066, - "logits/rejected": -2.818835496902466, - "logps/chosen": -284.47930908203125, - "logps/rejected": -365.02630615234375, - "loss": 0.4447, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.15361805260181427, - "rewards/margins": 2.5342843532562256, - "rewards/rejected": -2.3806662559509277, + "logits/chosen": -2.815110921859741, + "logits/rejected": -2.8408522605895996, + "logps/chosen": -281.46380615234375, + "logps/rejected": -301.83843994140625, + "loss": 0.363, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.4529246687889099, + "rewards/margins": 4.031641006469727, + "rewards/rejected": -3.57871675491333, "step": 1100 }, { "epoch": 0.57, - "eval_logits/chosen": -2.7942800521850586, - "eval_logits/rejected": -2.870736837387085, - "eval_logps/chosen": -244.3843994140625, - "eval_logps/rejected": -300.1572570800781, - "eval_loss": 0.4398292303085327, - "eval_rewards/accuracies": 0.8100000023841858, - "eval_rewards/chosen": 0.3295760750770569, - "eval_rewards/margins": 2.731562614440918, - "eval_rewards/rejected": -2.4019863605499268, - "eval_runtime": 278.8945, - "eval_samples_per_second": 7.171, - "eval_steps_per_second": 0.448, + "eval_logits/chosen": -2.82563853263855, + "eval_logits/rejected": -2.8564233779907227, + "eval_logps/chosen": -242.62884521484375, + "eval_logps/rejected": -261.95819091796875, + "eval_loss": 0.32663393020629883, + "eval_rewards/accuracies": 0.8420000076293945, + "eval_rewards/chosen": 0.5185638666152954, + "eval_rewards/margins": 4.141000747680664, + "eval_rewards/rejected": -3.6224374771118164, + "eval_runtime": 276.1624, + "eval_samples_per_second": 7.242, + "eval_steps_per_second": 0.453, "step": 1100 }, { "epoch": 0.57, "learning_rate": 4.495123350545037e-07, - "logits/chosen": -2.8621342182159424, - "logits/rejected": -2.8880348205566406, - "logps/chosen": -304.4235534667969, - "logps/rejected": -333.8514709472656, - "loss": 0.4558, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.11846466362476349, - "rewards/margins": 2.162449359893799, - "rewards/rejected": -2.280913829803467, + "logits/chosen": -2.8903167247772217, + "logits/rejected": -2.8864941596984863, + "logps/chosen": -299.1114807128906, + "logps/rejected": -274.5953063964844, + "loss": 0.3101, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.4142917990684509, + "rewards/margins": 3.717329740524292, + "rewards/rejected": -3.3030381202697754, "step": 1110 }, { "epoch": 0.58, "learning_rate": 4.4855612927902083e-07, - "logits/chosen": -2.8462681770324707, - "logits/rejected": -2.8501837253570557, - "logps/chosen": -270.5716552734375, - "logps/rejected": -292.13238525390625, - "loss": 0.5499, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.028979312628507614, - "rewards/margins": 2.364931583404541, - "rewards/rejected": -2.3359522819519043, + "logits/chosen": -2.8250651359558105, + "logits/rejected": -2.823275566101074, + "logps/chosen": -267.20428466796875, + "logps/rejected": -250.01220703125, + "loss": 0.4423, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 0.3650149703025818, + "rewards/margins": 4.158513069152832, + "rewards/rejected": -3.7934985160827637, "step": 1120 }, { "epoch": 0.58, "learning_rate": 4.4759992350353795e-07, - "logits/chosen": -2.821378231048584, - "logits/rejected": -2.7994885444641113, - "logps/chosen": -229.1116943359375, - "logps/rejected": -278.6077880859375, - "loss": 0.4073, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.4151396155357361, - "rewards/margins": 3.402142286300659, - "rewards/rejected": -2.9870028495788574, + "logits/chosen": -2.8055803775787354, + "logits/rejected": -2.778596878051758, + "logps/chosen": -227.73779296875, + "logps/rejected": -268.85528564453125, + "loss": 0.2773, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.5525280833244324, + "rewards/margins": 4.836615562438965, + "rewards/rejected": -4.284087657928467, "step": 1130 }, { "epoch": 0.59, "learning_rate": 4.46643717728055e-07, - "logits/chosen": -2.7817282676696777, - "logits/rejected": -2.7494282722473145, - "logps/chosen": -238.2794647216797, - "logps/rejected": -298.65216064453125, - "loss": 0.4258, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.0842290148139, - "rewards/margins": 2.6528801918029785, - "rewards/rejected": -2.568650960922241, + "logits/chosen": -2.791545867919922, + "logits/rejected": -2.7320823669433594, + "logps/chosen": -235.86508178710938, + "logps/rejected": -238.9776153564453, + "loss": 0.315, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.32753780484199524, + "rewards/margins": 4.873213291168213, + "rewards/rejected": -4.545675754547119, "step": 1140 }, { "epoch": 0.59, "learning_rate": 4.4568751195257213e-07, - "logits/chosen": -2.7463810443878174, - "logits/rejected": -2.749958038330078, - "logps/chosen": -273.40826416015625, - "logps/rejected": -321.42889404296875, - "loss": 0.4437, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": 0.8225911855697632, - "rewards/margins": 3.9052181243896484, - "rewards/rejected": -3.0826268196105957, + "logits/chosen": -2.730159044265747, + "logits/rejected": -2.718583106994629, + "logps/chosen": -273.0976867675781, + "logps/rejected": -279.687255859375, + "loss": 0.3027, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.8536493182182312, + "rewards/margins": 5.1891913414001465, + "rewards/rejected": -4.335541725158691, "step": 1150 }, { "epoch": 0.6, "learning_rate": 4.447313061770893e-07, - "logits/chosen": -2.7460649013519287, - "logits/rejected": -2.712463617324829, - "logps/chosen": -253.2078094482422, - "logps/rejected": -373.8803405761719, - "loss": 0.4737, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": 0.4817771315574646, - "rewards/margins": 3.4784576892852783, - "rewards/rejected": -2.996680736541748, + "logits/chosen": -2.7607192993164062, + "logits/rejected": -2.732609272003174, + "logps/chosen": -249.04110717773438, + "logps/rejected": -316.8661193847656, + "loss": 0.3524, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 0.8961545825004578, + "rewards/margins": 4.206530570983887, + "rewards/rejected": -3.3103766441345215, "step": 1160 }, { "epoch": 0.6, "learning_rate": 4.437751004016064e-07, - "logits/chosen": -2.683619976043701, - "logits/rejected": -2.715719699859619, - "logps/chosen": -225.0996856689453, - "logps/rejected": -282.35101318359375, - "loss": 0.5286, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.4098489284515381, - "rewards/margins": 2.038952589035034, - "rewards/rejected": -2.448801279067993, + "logits/chosen": -2.6869418621063232, + "logits/rejected": -2.69002103805542, + "logps/chosen": -222.70556640625, + "logps/rejected": -270.34661865234375, + "loss": 0.3356, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.17191821336746216, + "rewards/margins": 4.1408281326293945, + "rewards/rejected": -4.312746047973633, "step": 1170 }, { "epoch": 0.61, "learning_rate": 4.4281889462612353e-07, - "logits/chosen": -2.783116340637207, - "logits/rejected": -2.8799386024475098, - "logps/chosen": -219.9791717529297, - "logps/rejected": -270.49774169921875, - "loss": 0.4362, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": 0.3964368999004364, - "rewards/margins": 3.005711078643799, - "rewards/rejected": -2.60927414894104, + "logits/chosen": -2.802142858505249, + "logits/rejected": -2.851046562194824, + "logps/chosen": -219.8386993408203, + "logps/rejected": -247.44796752929688, + "loss": 0.3158, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.4108962416648865, + "rewards/margins": 4.458193778991699, + "rewards/rejected": -4.047297954559326, "step": 1180 }, { "epoch": 0.61, "learning_rate": 4.4186268885064064e-07, - "logits/chosen": -2.8068182468414307, - "logits/rejected": -2.8661465644836426, - "logps/chosen": -277.33917236328125, - "logps/rejected": -300.82470703125, - "loss": 0.5316, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": 0.07599838078022003, - "rewards/margins": 2.2254130840301514, - "rewards/rejected": -2.149414539337158, + "logits/chosen": -2.8008885383605957, + "logits/rejected": -2.7781271934509277, + "logps/chosen": -275.7025146484375, + "logps/rejected": -247.1252899169922, + "loss": 0.3824, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 0.21768680214881897, + "rewards/margins": 4.099157333374023, + "rewards/rejected": -3.8814704418182373, "step": 1190 }, { "epoch": 0.62, "learning_rate": 4.4090648307515776e-07, - "logits/chosen": -2.8061962127685547, - "logits/rejected": -2.864501476287842, - "logps/chosen": -193.76907348632812, - "logps/rejected": -220.1031036376953, - "loss": 0.4971, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -0.01744268834590912, - "rewards/margins": 2.256462335586548, - "rewards/rejected": -2.273904800415039, + "logits/chosen": -2.776665210723877, + "logits/rejected": -2.8155367374420166, + "logps/chosen": -198.86993408203125, + "logps/rejected": -206.2711181640625, + "loss": 0.3675, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": -0.1491793394088745, + "rewards/margins": 4.132321357727051, + "rewards/rejected": -4.281500339508057, "step": 1200 }, { "epoch": 0.62, - "eval_logits/chosen": -2.782477855682373, - "eval_logits/rejected": -2.8602378368377686, - "eval_logps/chosen": -242.60581970214844, - "eval_logps/rejected": -298.29931640625, - "eval_loss": 0.44120046496391296, - "eval_rewards/accuracies": 0.7940000295639038, - "eval_rewards/chosen": 0.507435142993927, - "eval_rewards/margins": 2.723625898361206, - "eval_rewards/rejected": -2.2161905765533447, - "eval_runtime": 278.8134, - "eval_samples_per_second": 7.173, - "eval_steps_per_second": 0.448, + "eval_logits/chosen": -2.796037435531616, + "eval_logits/rejected": -2.829378366470337, + "eval_logps/chosen": -241.879638671875, + "eval_logps/rejected": -265.6197204589844, + "eval_loss": 0.3256494104862213, + "eval_rewards/accuracies": 0.8320000171661377, + "eval_rewards/chosen": 0.5934866666793823, + "eval_rewards/margins": 4.582074165344238, + "eval_rewards/rejected": -3.9885876178741455, + "eval_runtime": 276.0153, + "eval_samples_per_second": 7.246, + "eval_steps_per_second": 0.453, "step": 1200 }, { "epoch": 0.62, "learning_rate": 4.399502772996749e-07, - "logits/chosen": -2.8208165168762207, - "logits/rejected": -2.87434720993042, - "logps/chosen": -228.93905639648438, - "logps/rejected": -299.3986511230469, - "loss": 0.4816, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": 0.21798264980316162, - "rewards/margins": 2.769120693206787, - "rewards/rejected": -2.5511374473571777, + "logits/chosen": -2.826685667037964, + "logits/rejected": -2.842958927154541, + "logps/chosen": -225.7928009033203, + "logps/rejected": -257.5811767578125, + "loss": 0.3691, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5290689468383789, + "rewards/margins": 4.952437400817871, + "rewards/rejected": -4.423368453979492, "step": 1210 }, { "epoch": 0.63, "learning_rate": 4.38994071524192e-07, - "logits/chosen": -2.924377918243408, - "logits/rejected": -2.951292037963867, - "logps/chosen": -258.6681823730469, - "logps/rejected": -258.79351806640625, - "loss": 0.4607, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": 0.15325573086738586, - "rewards/margins": 2.3194260597229004, - "rewards/rejected": -2.1661698818206787, + "logits/chosen": -2.8503339290618896, + "logits/rejected": -2.885425090789795, + "logps/chosen": -256.9381103515625, + "logps/rejected": -238.5821533203125, + "loss": 0.3203, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 0.32661983370780945, + "rewards/margins": 4.4337053298950195, + "rewards/rejected": -4.107085704803467, "step": 1220 }, { "epoch": 0.64, "learning_rate": 4.380378657487091e-07, - "logits/chosen": -2.915999174118042, - "logits/rejected": -2.9771485328674316, - "logps/chosen": -262.7147521972656, - "logps/rejected": -365.8441162109375, - "loss": 0.5435, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": 0.577280580997467, - "rewards/margins": 3.5463051795959473, - "rewards/rejected": -2.969024419784546, + "logits/chosen": -2.807856321334839, + "logits/rejected": -2.8470704555511475, + "logps/chosen": -258.91925048828125, + "logps/rejected": -299.7273254394531, + "loss": 0.3577, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 0.9549368023872375, + "rewards/margins": 5.6488847732543945, + "rewards/rejected": -4.693947792053223, "step": 1230 }, { "epoch": 0.64, "learning_rate": 4.370816599732262e-07, - "logits/chosen": -2.927295207977295, - "logits/rejected": -3.0081796646118164, - "logps/chosen": -247.39413452148438, - "logps/rejected": -281.930908203125, - "loss": 0.4259, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 0.4076114594936371, - "rewards/margins": 3.5351879596710205, - "rewards/rejected": -3.1275763511657715, + "logits/chosen": -2.8242220878601074, + "logits/rejected": -2.8554110527038574, + "logps/chosen": -245.8039093017578, + "logps/rejected": -242.9838104248047, + "loss": 0.354, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 0.567823052406311, + "rewards/margins": 5.013625144958496, + "rewards/rejected": -4.445802211761475, "step": 1240 }, { "epoch": 0.65, "learning_rate": 4.3612545419774334e-07, - "logits/chosen": -2.8849966526031494, - "logits/rejected": -2.914768695831299, - "logps/chosen": -219.829345703125, - "logps/rejected": -322.58203125, - "loss": 0.5023, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.16795411705970764, - "rewards/margins": 2.0571203231811523, - "rewards/rejected": -2.225074291229248, + "logits/chosen": -2.799834728240967, + "logits/rejected": -2.801335334777832, + "logps/chosen": -217.779052734375, + "logps/rejected": -260.1390075683594, + "loss": 0.3746, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.037664007395505905, + "rewards/margins": 4.134084701538086, + "rewards/rejected": -4.096421241760254, "step": 1250 }, { "epoch": 0.65, "learning_rate": 4.3516924842226045e-07, - "logits/chosen": -2.861953020095825, - "logits/rejected": -2.931356430053711, - "logps/chosen": -271.10101318359375, - "logps/rejected": -307.50628662109375, - "loss": 0.4281, + "logits/chosen": -2.7819128036499023, + "logits/rejected": -2.819404363632202, + "logps/chosen": -269.98199462890625, + "logps/rejected": -260.01336669921875, + "loss": 0.45, "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": 0.07305797189474106, - "rewards/margins": 2.9939346313476562, - "rewards/rejected": -2.9208762645721436, + "rewards/chosen": 0.18496528267860413, + "rewards/margins": 4.298933506011963, + "rewards/rejected": -4.1139678955078125, "step": 1260 }, { "epoch": 0.66, "learning_rate": 4.3421304264677757e-07, - "logits/chosen": -2.904646396636963, - "logits/rejected": -2.9532904624938965, - "logps/chosen": -249.11837768554688, - "logps/rejected": -269.0419921875, - "loss": 0.393, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.16748471558094025, - "rewards/margins": 2.746199131011963, - "rewards/rejected": -2.578714370727539, + "logits/chosen": -2.820605754852295, + "logits/rejected": -2.8283324241638184, + "logps/chosen": -248.30221557617188, + "logps/rejected": -219.8176727294922, + "loss": 0.324, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.2461501806974411, + "rewards/margins": 4.795166015625, + "rewards/rejected": -4.54901647567749, "step": 1270 }, { "epoch": 0.66, "learning_rate": 4.332568368712947e-07, - "logits/chosen": -2.971287250518799, - "logits/rejected": -3.0082404613494873, - "logps/chosen": -250.14260864257812, - "logps/rejected": -292.34637451171875, - "loss": 0.4698, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.24356532096862793, - "rewards/margins": 2.4449563026428223, - "rewards/rejected": -2.6885218620300293, + "logits/chosen": -2.9526381492614746, + "logits/rejected": -2.921189546585083, + "logps/chosen": -246.56838989257812, + "logps/rejected": -262.57672119140625, + "loss": 0.4148, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.11385713517665863, + "rewards/margins": 4.110852241516113, + "rewards/rejected": -3.996994733810425, "step": 1280 }, { "epoch": 0.67, "learning_rate": 4.323006310958118e-07, - "logits/chosen": -2.863560676574707, - "logits/rejected": -2.9776759147644043, - "logps/chosen": -264.33233642578125, - "logps/rejected": -316.8869323730469, - "loss": 0.4937, - "rewards/accuracies": 0.8125, - "rewards/chosen": 0.3103228807449341, - "rewards/margins": 3.370903491973877, - "rewards/rejected": -3.0605804920196533, + "logits/chosen": -2.8807690143585205, + "logits/rejected": -2.9343276023864746, + "logps/chosen": -261.0328674316406, + "logps/rejected": -278.9743957519531, + "loss": 0.3943, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 0.6419084668159485, + "rewards/margins": 4.750491619110107, + "rewards/rejected": -4.108582973480225, "step": 1290 }, { "epoch": 0.67, "learning_rate": 4.313444253203289e-07, - "logits/chosen": -2.913517713546753, - "logits/rejected": -2.9551703929901123, - "logps/chosen": -270.5498962402344, - "logps/rejected": -278.1402282714844, - "loss": 0.5218, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": 0.27814558148384094, - "rewards/margins": 2.525500535964966, - "rewards/rejected": -2.2473549842834473, + "logits/chosen": -2.9321370124816895, + "logits/rejected": -2.9273550510406494, + "logps/chosen": -271.53045654296875, + "logps/rejected": -213.25009155273438, + "loss": 0.3265, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.17942379415035248, + "rewards/margins": 5.026391506195068, + "rewards/rejected": -4.8469672203063965, "step": 1300 }, { "epoch": 0.67, - "eval_logits/chosen": -2.886589288711548, - "eval_logits/rejected": -2.9536550045013428, - "eval_logps/chosen": -242.95413208007812, - "eval_logps/rejected": -299.2200622558594, - "eval_loss": 0.49863824248313904, - "eval_rewards/accuracies": 0.7960000038146973, - "eval_rewards/chosen": 0.4726015031337738, - "eval_rewards/margins": 2.7808680534362793, - "eval_rewards/rejected": -2.3082668781280518, - "eval_runtime": 278.6877, - "eval_samples_per_second": 7.176, - "eval_steps_per_second": 0.449, + "eval_logits/chosen": -2.8869292736053467, + "eval_logits/rejected": -2.918574810028076, + "eval_logps/chosen": -240.5668182373047, + "eval_logps/rejected": -266.2407531738281, + "eval_loss": 0.333926796913147, + "eval_rewards/accuracies": 0.8500000238418579, + "eval_rewards/chosen": 0.7247689962387085, + "eval_rewards/margins": 4.7754597663879395, + "eval_rewards/rejected": -4.0506911277771, + "eval_runtime": 276.3497, + "eval_samples_per_second": 7.237, + "eval_steps_per_second": 0.452, "step": 1300 }, { "epoch": 0.68, "learning_rate": 4.3038821954484603e-07, - "logits/chosen": -2.930354356765747, - "logits/rejected": -2.9470126628875732, - "logps/chosen": -264.21075439453125, - "logps/rejected": -299.3939514160156, - "loss": 0.434, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": 0.06842346489429474, - "rewards/margins": 2.0977582931518555, - "rewards/rejected": -2.029334545135498, + "logits/chosen": -2.9732444286346436, + "logits/rejected": -2.891091823577881, + "logps/chosen": -261.42169189453125, + "logps/rejected": -217.8062286376953, + "loss": 0.2745, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.3565501570701599, + "rewards/margins": 5.2472686767578125, + "rewards/rejected": -4.89071798324585, "step": 1310 }, { "epoch": 0.68, "learning_rate": 4.2943201376936315e-07, - "logits/chosen": -2.879948377609253, - "logits/rejected": -2.91914439201355, - "logps/chosen": -240.50375366210938, - "logps/rejected": -293.88970947265625, - "loss": 0.4489, - "rewards/accuracies": 0.8125, - "rewards/chosen": 0.028376024216413498, - "rewards/margins": 2.692608594894409, - "rewards/rejected": -2.6642327308654785, + "logits/chosen": -2.924577474594116, + "logits/rejected": -2.9318995475769043, + "logps/chosen": -238.8622589111328, + "logps/rejected": -283.12542724609375, + "loss": 0.3242, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 0.1925247758626938, + "rewards/margins": 4.491854667663574, + "rewards/rejected": -4.29932975769043, "step": 1320 }, { "epoch": 0.69, "learning_rate": 4.2847580799388026e-07, - "logits/chosen": -2.8353028297424316, - "logits/rejected": -2.896289348602295, - "logps/chosen": -263.8857421875, - "logps/rejected": -333.9789123535156, - "loss": 0.4939, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": 0.7538554072380066, - "rewards/margins": 2.6621108055114746, - "rewards/rejected": -1.9082553386688232, + "logits/chosen": -2.8703885078430176, + "logits/rejected": -2.890921115875244, + "logps/chosen": -263.49090576171875, + "logps/rejected": -308.0920104980469, + "loss": 0.3466, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.7952240705490112, + "rewards/margins": 4.85191011428833, + "rewards/rejected": -4.056685924530029, "step": 1330 }, { "epoch": 0.69, "learning_rate": 4.275196022183974e-07, - "logits/chosen": -2.7950098514556885, - "logits/rejected": -2.842745780944824, - "logps/chosen": -229.3527069091797, - "logps/rejected": -267.72998046875, - "loss": 0.5018, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": 0.5895912051200867, - "rewards/margins": 2.599395513534546, - "rewards/rejected": -2.0098042488098145, + "logits/chosen": -2.8670029640197754, + "logits/rejected": -2.842885971069336, + "logps/chosen": -228.1724395751953, + "logps/rejected": -240.37753295898438, + "loss": 0.339, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.7076202034950256, + "rewards/margins": 4.393362045288086, + "rewards/rejected": -3.685741901397705, "step": 1340 }, { "epoch": 0.7, "learning_rate": 4.265633964429145e-07, - "logits/chosen": -2.8519222736358643, - "logits/rejected": -2.9202880859375, - "logps/chosen": -193.359375, - "logps/rejected": -252.4734344482422, - "loss": 0.5209, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": 0.6954705119132996, - "rewards/margins": 2.9130749702453613, - "rewards/rejected": -2.217604398727417, + "logits/chosen": -2.910736322402954, + "logits/rejected": -2.950467586517334, + "logps/chosen": -192.33261108398438, + "logps/rejected": -228.1023406982422, + "loss": 0.352, + "rewards/accuracies": 0.8125, + "rewards/chosen": 1.0341860055923462, + "rewards/margins": 4.925230979919434, + "rewards/rejected": -3.891045331954956, "step": 1350 }, { "epoch": 0.7, "learning_rate": 4.256071906674316e-07, - "logits/chosen": -2.8548550605773926, - "logits/rejected": -2.936178684234619, - "logps/chosen": -321.51763916015625, - "logps/rejected": -315.2483825683594, - "loss": 0.563, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": 0.30997490882873535, - "rewards/margins": 2.0616490840911865, - "rewards/rejected": -1.7516740560531616, + "logits/chosen": -2.9050211906433105, + "logits/rejected": -2.9435229301452637, + "logps/chosen": -316.89727783203125, + "logps/rejected": -264.8079528808594, + "loss": 0.4661, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.9692543745040894, + "rewards/margins": 4.720278263092041, + "rewards/rejected": -3.751023530960083, "step": 1360 }, { "epoch": 0.71, "learning_rate": 4.246509848919487e-07, - "logits/chosen": -2.912834882736206, - "logits/rejected": -3.0000414848327637, - "logps/chosen": -203.55709838867188, - "logps/rejected": -300.4374084472656, - "loss": 0.5563, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.46916669607162476, - "rewards/margins": 2.8837640285491943, - "rewards/rejected": -2.414597511291504, + "logits/chosen": -2.945949077606201, + "logits/rejected": -2.9557244777679443, + "logps/chosen": -203.13511657714844, + "logps/rejected": -263.85296630859375, + "loss": 0.422, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.51136714220047, + "rewards/margins": 5.333380222320557, + "rewards/rejected": -4.822012901306152, "step": 1370 }, { "epoch": 0.71, "learning_rate": 4.2369477911646584e-07, - "logits/chosen": -2.943681478500366, - "logits/rejected": -3.013051986694336, - "logps/chosen": -218.1947784423828, - "logps/rejected": -263.2126159667969, - "loss": 0.4797, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": 0.47852277755737305, - "rewards/margins": 2.767585277557373, - "rewards/rejected": -2.289062261581421, + "logits/chosen": -2.9749226570129395, + "logits/rejected": -2.979949474334717, + "logps/chosen": -216.938232421875, + "logps/rejected": -234.1802520751953, + "loss": 0.3766, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.6056711077690125, + "rewards/margins": 4.949410438537598, + "rewards/rejected": -4.3437395095825195, "step": 1380 }, { "epoch": 0.72, "learning_rate": 4.2273857334098296e-07, - "logits/chosen": -2.8952932357788086, - "logits/rejected": -2.9491114616394043, - "logps/chosen": -295.91815185546875, - "logps/rejected": -293.3535461425781, - "loss": 0.5053, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": 0.44421887397766113, - "rewards/margins": 2.2546558380126953, - "rewards/rejected": -1.8104368448257446, + "logits/chosen": -2.916769504547119, + "logits/rejected": -2.9068217277526855, + "logps/chosen": -292.4642028808594, + "logps/rejected": -252.4457244873047, + "loss": 0.3837, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.8129030466079712, + "rewards/margins": 4.1942901611328125, + "rewards/rejected": -3.3813865184783936, "step": 1390 }, { "epoch": 0.72, "learning_rate": 4.2178236756550007e-07, - "logits/chosen": -2.904174566268921, - "logits/rejected": -2.96537184715271, - "logps/chosen": -301.59210205078125, - "logps/rejected": -297.4707946777344, - "loss": 0.6129, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": 0.25321847200393677, - "rewards/margins": 2.3061304092407227, - "rewards/rejected": -2.0529122352600098, + "logits/chosen": -2.896195650100708, + "logits/rejected": -2.90677547454834, + "logps/chosen": -302.37481689453125, + "logps/rejected": -260.61468505859375, + "loss": 0.4276, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.1749468743801117, + "rewards/margins": 4.319541931152344, + "rewards/rejected": -4.144595146179199, "step": 1400 }, { "epoch": 0.72, - "eval_logits/chosen": -2.9437789916992188, - "eval_logits/rejected": -3.0072262287139893, - "eval_logps/chosen": -242.10218811035156, - "eval_logps/rejected": -298.38385009765625, - "eval_loss": 0.4817677140235901, - "eval_rewards/accuracies": 0.8080000281333923, - "eval_rewards/chosen": 0.5577969551086426, - "eval_rewards/margins": 2.7824389934539795, - "eval_rewards/rejected": -2.224642038345337, - "eval_runtime": 279.044, - "eval_samples_per_second": 7.167, - "eval_steps_per_second": 0.448, + "eval_logits/chosen": -2.904446840286255, + "eval_logits/rejected": -2.9353818893432617, + "eval_logps/chosen": -241.9300537109375, + "eval_logps/rejected": -268.18341064453125, + "eval_loss": 0.3391246199607849, + "eval_rewards/accuracies": 0.843999981880188, + "eval_rewards/chosen": 0.5884455442428589, + "eval_rewards/margins": 4.833401679992676, + "eval_rewards/rejected": -4.2449564933776855, + "eval_runtime": 276.7555, + "eval_samples_per_second": 7.227, + "eval_steps_per_second": 0.452, "step": 1400 }, { "epoch": 0.73, "learning_rate": 4.208261617900172e-07, - "logits/chosen": -2.973268747329712, - "logits/rejected": -3.0276944637298584, - "logps/chosen": -262.35491943359375, - "logps/rejected": -244.64102172851562, - "loss": 0.4874, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": 0.5497137904167175, - "rewards/margins": 2.7235116958618164, - "rewards/rejected": -2.173797607421875, + "logits/chosen": -2.9520788192749023, + "logits/rejected": -2.957977294921875, + "logps/chosen": -262.1817321777344, + "logps/rejected": -222.43624877929688, + "loss": 0.4176, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.5668151378631592, + "rewards/margins": 4.916453838348389, + "rewards/rejected": -4.349638938903809, "step": 1410 }, { "epoch": 0.73, "learning_rate": 4.198699560145343e-07, - "logits/chosen": -2.862189292907715, - "logits/rejected": -2.9037973880767822, - "logps/chosen": -240.8940887451172, - "logps/rejected": -295.91339111328125, - "loss": 0.6516, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": 0.6252197623252869, - "rewards/margins": 2.2482056617736816, - "rewards/rejected": -1.6229861974716187, + "logits/chosen": -2.778014659881592, + "logits/rejected": -2.8117594718933105, + "logps/chosen": -239.83291625976562, + "logps/rejected": -252.0980987548828, + "loss": 0.3814, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 0.7317675352096558, + "rewards/margins": 5.216647148132324, + "rewards/rejected": -4.484879016876221, "step": 1420 }, { "epoch": 0.74, "learning_rate": 4.189137502390514e-07, - "logits/chosen": -2.931879997253418, - "logits/rejected": -2.965914487838745, - "logps/chosen": -259.8681335449219, - "logps/rejected": -316.9034729003906, - "loss": 0.5662, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.13505953550338745, - "rewards/margins": 2.2420730590820312, - "rewards/rejected": -2.107013702392578, + "logits/chosen": -2.93105149269104, + "logits/rejected": -2.8723952770233154, + "logps/chosen": -260.08563232421875, + "logps/rejected": -284.1394348144531, + "loss": 0.3301, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.5464638471603394, + "rewards/margins": 4.4239912033081055, + "rewards/rejected": -3.8775277137756348, "step": 1430 }, { "epoch": 0.74, "learning_rate": 4.179575444635686e-07, - "logits/chosen": -2.9142744541168213, - "logits/rejected": -2.95097279548645, - "logps/chosen": -305.0422668457031, - "logps/rejected": -317.8971252441406, - "loss": 0.4804, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": 0.6268448233604431, - "rewards/margins": 2.3850245475769043, - "rewards/rejected": -1.7581799030303955, + "logits/chosen": -2.921387195587158, + "logits/rejected": -2.905458688735962, + "logps/chosen": -304.4259033203125, + "logps/rejected": -284.3158264160156, + "loss": 0.3347, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.6884804964065552, + "rewards/margins": 4.329853057861328, + "rewards/rejected": -3.6413726806640625, "step": 1440 }, { "epoch": 0.75, "learning_rate": 4.170013386880857e-07, - "logits/chosen": -2.8860132694244385, - "logits/rejected": -2.9283335208892822, - "logps/chosen": -219.405029296875, - "logps/rejected": -327.70123291015625, - "loss": 0.4487, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.6366171836853027, - "rewards/margins": 2.8044886589050293, - "rewards/rejected": -2.1678714752197266, + "logits/chosen": -2.903087854385376, + "logits/rejected": -2.9174695014953613, + "logps/chosen": -218.29867553710938, + "logps/rejected": -300.3997802734375, + "loss": 0.3382, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.7464351058006287, + "rewards/margins": 4.849874973297119, + "rewards/rejected": -4.103440284729004, "step": 1450 }, { "epoch": 0.75, "learning_rate": 4.1604513291260277e-07, - "logits/chosen": -2.8814711570739746, - "logits/rejected": -2.9225306510925293, - "logps/chosen": -218.7816162109375, - "logps/rejected": -295.43267822265625, - "loss": 0.5484, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.5335996150970459, - "rewards/margins": 2.8809235095977783, - "rewards/rejected": -2.3473238945007324, + "logits/chosen": -2.940701961517334, + "logits/rejected": -2.8996694087982178, + "logps/chosen": -216.47573852539062, + "logps/rejected": -272.1661376953125, + "loss": 0.4875, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 0.7628965377807617, + "rewards/margins": 5.872712135314941, + "rewards/rejected": -5.109816074371338, "step": 1460 }, { "epoch": 0.76, "learning_rate": 4.150889271371199e-07, - "logits/chosen": -2.804237127304077, - "logits/rejected": -2.8750622272491455, - "logps/chosen": -279.5050964355469, - "logps/rejected": -276.02874755859375, - "loss": 0.5363, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": 0.5062381625175476, - "rewards/margins": 3.0678353309631348, - "rewards/rejected": -2.5615968704223633, + "logits/chosen": -2.850949764251709, + "logits/rejected": -2.838317394256592, + "logps/chosen": -278.03387451171875, + "logps/rejected": -257.4394836425781, + "loss": 0.3889, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 0.6531850695610046, + "rewards/margins": 4.759385108947754, + "rewards/rejected": -4.106200218200684, "step": 1470 }, { "epoch": 0.76, "learning_rate": 4.14132721361637e-07, - "logits/chosen": -2.817847728729248, - "logits/rejected": -2.8680078983306885, - "logps/chosen": -246.18038940429688, - "logps/rejected": -220.83285522460938, - "loss": 0.3628, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": 0.5915089249610901, - "rewards/margins": 3.000457286834717, - "rewards/rejected": -2.4089484214782715, + "logits/chosen": -2.8649725914001465, + "logits/rejected": -2.827401876449585, + "logps/chosen": -244.4897003173828, + "logps/rejected": -195.82089233398438, + "loss": 0.276, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.7605772018432617, + "rewards/margins": 5.574331283569336, + "rewards/rejected": -4.813754081726074, "step": 1480 }, { "epoch": 0.77, "learning_rate": 4.131765155861541e-07, - "logits/chosen": -2.776465892791748, - "logits/rejected": -2.858459949493408, - "logps/chosen": -230.6031494140625, - "logps/rejected": -250.45510864257812, - "loss": 0.4606, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": 0.39864081144332886, - "rewards/margins": 3.0935165882110596, - "rewards/rejected": -2.694875478744507, + "logits/chosen": -2.8088066577911377, + "logits/rejected": -2.813155174255371, + "logps/chosen": -228.7838592529297, + "logps/rejected": -208.28897094726562, + "loss": 0.3024, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 1.0388944149017334, + "rewards/margins": 5.378073692321777, + "rewards/rejected": -4.339179992675781, "step": 1490 }, { "epoch": 0.77, "learning_rate": 4.1222030981067123e-07, - "logits/chosen": -2.847076654434204, - "logits/rejected": -2.8960652351379395, - "logps/chosen": -267.3013916015625, - "logps/rejected": -305.4517822265625, - "loss": 0.3862, - "rewards/accuracies": 0.8125, - "rewards/chosen": 0.035213030874729156, - "rewards/margins": 2.688610076904297, - "rewards/rejected": -2.6533970832824707, + "logits/chosen": -2.887725353240967, + "logits/rejected": -2.8777480125427246, + "logps/chosen": -262.41973876953125, + "logps/rejected": -271.9168395996094, + "loss": 0.3512, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.5233832597732544, + "rewards/margins": 4.933677673339844, + "rewards/rejected": -4.410295009613037, "step": 1500 }, { "epoch": 0.77, - "eval_logits/chosen": -2.8354265689849854, - "eval_logits/rejected": -2.897573947906494, - "eval_logps/chosen": -244.42625427246094, - "eval_logps/rejected": -302.6622009277344, - "eval_loss": 0.4689019024372101, - "eval_rewards/accuracies": 0.8140000104904175, - "eval_rewards/chosen": 0.32539257407188416, - "eval_rewards/margins": 2.97786808013916, - "eval_rewards/rejected": -2.652475595474243, - "eval_runtime": 278.9347, - "eval_samples_per_second": 7.17, - "eval_steps_per_second": 0.448, + "eval_logits/chosen": -2.8697025775909424, + "eval_logits/rejected": -2.9077866077423096, + "eval_logps/chosen": -239.0665283203125, + "eval_logps/rejected": -267.7491760253906, + "eval_loss": 0.35968995094299316, + "eval_rewards/accuracies": 0.8360000252723694, + "eval_rewards/chosen": 0.8747963905334473, + "eval_rewards/margins": 5.076330661773682, + "eval_rewards/rejected": -4.201534748077393, + "eval_runtime": 276.1438, + "eval_samples_per_second": 7.243, + "eval_steps_per_second": 0.453, "step": 1500 }, { "epoch": 0.78, "learning_rate": 4.1126410403518835e-07, - "logits/chosen": -2.8334925174713135, - "logits/rejected": -2.8591761589050293, - "logps/chosen": -221.348876953125, - "logps/rejected": -311.17877197265625, - "loss": 0.4741, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": 0.015551751479506493, - "rewards/margins": 2.4997267723083496, - "rewards/rejected": -2.484175443649292, + "logits/chosen": -2.8690896034240723, + "logits/rejected": -2.8833861351013184, + "logps/chosen": -215.3618927001953, + "logps/rejected": -281.7811584472656, + "loss": 0.3217, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 0.6137517690658569, + "rewards/margins": 6.23096227645874, + "rewards/rejected": -5.617210865020752, "step": 1510 }, { "epoch": 0.78, "learning_rate": 4.1030789825970546e-07, - "logits/chosen": -2.754701852798462, - "logits/rejected": -2.8493895530700684, - "logps/chosen": -275.2632751464844, - "logps/rejected": -338.8531494140625, - "loss": 0.4382, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": 0.5648075342178345, - "rewards/margins": 3.1909804344177246, - "rewards/rejected": -2.6261725425720215, + "logits/chosen": -2.7732720375061035, + "logits/rejected": -2.7872493267059326, + "logps/chosen": -264.47613525390625, + "logps/rejected": -279.06744384765625, + "loss": 0.2445, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.643922209739685, + "rewards/margins": 6.015587329864502, + "rewards/rejected": -4.371665000915527, "step": 1520 }, { "epoch": 0.79, "learning_rate": 4.093516924842226e-07, - "logits/chosen": -2.797532558441162, - "logits/rejected": -2.8553106784820557, - "logps/chosen": -282.1953125, - "logps/rejected": -335.3280334472656, - "loss": 0.3387, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": 0.3751303553581238, - "rewards/margins": 3.3121883869171143, - "rewards/rejected": -2.9370579719543457, + "logits/chosen": -2.805691957473755, + "logits/rejected": -2.8382484912872314, + "logps/chosen": -274.9083251953125, + "logps/rejected": -290.3618469238281, + "loss": 0.2752, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.1050134897232056, + "rewards/margins": 5.222965240478516, + "rewards/rejected": -4.1179518699646, "step": 1530 }, { "epoch": 0.8, "learning_rate": 4.083954867087397e-07, - "logits/chosen": -2.895432710647583, - "logits/rejected": -2.9414145946502686, - "logps/chosen": -220.44735717773438, - "logps/rejected": -296.5469970703125, - "loss": 0.5896, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.16891750693321228, - "rewards/margins": 2.6762678623199463, - "rewards/rejected": -2.507350444793701, + "logits/chosen": -2.943225860595703, + "logits/rejected": -2.948715925216675, + "logps/chosen": -216.09475708007812, + "logps/rejected": -266.1579895019531, + "loss": 0.3208, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.603803813457489, + "rewards/margins": 4.862678527832031, + "rewards/rejected": -4.258874893188477, "step": 1540 }, { "epoch": 0.8, "learning_rate": 4.074392809332568e-07, - "logits/chosen": -2.8410191535949707, - "logits/rejected": -2.9066405296325684, - "logps/chosen": -316.89971923828125, - "logps/rejected": -309.38934326171875, - "loss": 0.4013, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": 0.8427516222000122, - "rewards/margins": 3.9204134941101074, - "rewards/rejected": -3.0776619911193848, + "logits/chosen": -2.870087146759033, + "logits/rejected": -2.904085636138916, + "logps/chosen": -317.5494689941406, + "logps/rejected": -282.88446044921875, + "loss": 0.2744, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.7777743339538574, + "rewards/margins": 5.584456443786621, + "rewards/rejected": -4.8066816329956055, "step": 1550 }, { "epoch": 0.81, "learning_rate": 4.064830751577739e-07, - "logits/chosen": -2.889589548110962, - "logits/rejected": -2.917020320892334, - "logps/chosen": -243.4337921142578, - "logps/rejected": -297.63690185546875, - "loss": 0.4869, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": 0.4400361478328705, - "rewards/margins": 2.95405912399292, - "rewards/rejected": -2.5140228271484375, + "logits/chosen": -2.8875174522399902, + "logits/rejected": -2.8976080417633057, + "logps/chosen": -241.87255859375, + "logps/rejected": -250.11767578125, + "loss": 0.3667, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 0.5945970416069031, + "rewards/margins": 5.334896564483643, + "rewards/rejected": -4.740299224853516, "step": 1560 }, { "epoch": 0.81, "learning_rate": 4.0552686938229104e-07, - "logits/chosen": -2.902555227279663, - "logits/rejected": -2.956498384475708, - "logps/chosen": -219.6063232421875, - "logps/rejected": -291.71685791015625, - "loss": 0.4151, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": 0.315045028924942, - "rewards/margins": 2.819603204727173, - "rewards/rejected": -2.5045580863952637, + "logits/chosen": -2.879173994064331, + "logits/rejected": -2.9246304035186768, + "logps/chosen": -216.4102325439453, + "logps/rejected": -267.7967224121094, + "loss": 0.3402, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.636400580406189, + "rewards/margins": 4.764558792114258, + "rewards/rejected": -4.128157615661621, "step": 1570 }, { "epoch": 0.82, "learning_rate": 4.045706636068082e-07, - "logits/chosen": -2.898637056350708, - "logits/rejected": -2.9533817768096924, - "logps/chosen": -252.69180297851562, - "logps/rejected": -284.62744140625, - "loss": 0.4136, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -0.10596165806055069, - "rewards/margins": 3.081064462661743, - "rewards/rejected": -3.187026262283325, + "logits/chosen": -2.9266340732574463, + "logits/rejected": -2.897193431854248, + "logps/chosen": -247.1452178955078, + "logps/rejected": -230.5690155029297, + "loss": 0.3266, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 0.4496341347694397, + "rewards/margins": 5.153058052062988, + "rewards/rejected": -4.703423500061035, "step": 1580 }, { "epoch": 0.82, "learning_rate": 4.036144578313253e-07, - "logits/chosen": -2.8751070499420166, - "logits/rejected": -2.9103922843933105, - "logps/chosen": -226.5676727294922, - "logps/rejected": -296.97467041015625, - "loss": 0.4177, - "rewards/accuracies": 0.8125, - "rewards/chosen": 0.16602222621440887, - "rewards/margins": 2.9820897579193115, - "rewards/rejected": -2.8160674571990967, + "logits/chosen": -2.8789584636688232, + "logits/rejected": -2.839329481124878, + "logps/chosen": -221.5947265625, + "logps/rejected": -235.40768432617188, + "loss": 0.2673, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 0.6611341834068298, + "rewards/margins": 5.608819007873535, + "rewards/rejected": -4.947685241699219, "step": 1590 }, { "epoch": 0.83, "learning_rate": 4.0265825205584244e-07, - "logits/chosen": -2.88401460647583, - "logits/rejected": -2.9255869388580322, - "logps/chosen": -281.10113525390625, - "logps/rejected": -315.5570068359375, - "loss": 0.4186, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": -0.1843784749507904, - "rewards/margins": 2.7065744400024414, - "rewards/rejected": -2.8909528255462646, + "logits/chosen": -2.9016590118408203, + "logits/rejected": -2.9206817150115967, + "logps/chosen": -274.60302734375, + "logps/rejected": -294.82879638671875, + "loss": 0.3429, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.46543407440185547, + "rewards/margins": 4.490087032318115, + "rewards/rejected": -4.024653434753418, "step": 1600 }, { "epoch": 0.83, - "eval_logits/chosen": -2.8588759899139404, - "eval_logits/rejected": -2.920731544494629, - "eval_logps/chosen": -244.61883544921875, - "eval_logps/rejected": -305.6510925292969, - "eval_loss": 0.4497062563896179, - "eval_rewards/accuracies": 0.8040000200271606, - "eval_rewards/chosen": 0.3061320185661316, - "eval_rewards/margins": 3.2575008869171143, - "eval_rewards/rejected": -2.951368570327759, - "eval_runtime": 279.1026, - "eval_samples_per_second": 7.166, - "eval_steps_per_second": 0.448, + "eval_logits/chosen": -2.8661489486694336, + "eval_logits/rejected": -2.9040777683258057, + "eval_logps/chosen": -242.1075439453125, + "eval_logps/rejected": -270.0566101074219, + "eval_loss": 0.3303963840007782, + "eval_rewards/accuracies": 0.8360000252723694, + "eval_rewards/chosen": 0.5706946849822998, + "eval_rewards/margins": 5.002972602844238, + "eval_rewards/rejected": -4.432278156280518, + "eval_runtime": 276.6122, + "eval_samples_per_second": 7.23, + "eval_steps_per_second": 0.452, "step": 1600 }, { "epoch": 0.83, "learning_rate": 4.0170204628035956e-07, - "logits/chosen": -2.9068946838378906, - "logits/rejected": -2.9520983695983887, - "logps/chosen": -196.4069061279297, - "logps/rejected": -263.8167419433594, - "loss": 0.4258, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": 0.18958450853824615, - "rewards/margins": 3.1812098026275635, - "rewards/rejected": -2.9916253089904785, + "logits/chosen": -2.932310104370117, + "logits/rejected": -2.9477579593658447, + "logps/chosen": -194.57754516601562, + "logps/rejected": -243.3104705810547, + "loss": 0.305, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.3725212812423706, + "rewards/margins": 4.596383571624756, + "rewards/rejected": -4.223862171173096, "step": 1610 }, { "epoch": 0.84, "learning_rate": 4.007458405048766e-07, - "logits/chosen": -2.880481243133545, - "logits/rejected": -2.923464298248291, - "logps/chosen": -273.96710205078125, - "logps/rejected": -340.50286865234375, - "loss": 0.4167, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.3789766728878021, - "rewards/margins": 3.0592124462127686, - "rewards/rejected": -2.6802358627319336, + "logits/chosen": -2.9013373851776123, + "logits/rejected": -2.914280891418457, + "logps/chosen": -274.95489501953125, + "logps/rejected": -291.0045471191406, + "loss": 0.358, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.46183714270591736, + "rewards/margins": 5.023828983306885, + "rewards/rejected": -4.5619916915893555, "step": 1620 }, { "epoch": 0.84, "learning_rate": 3.9978963472939373e-07, - "logits/chosen": -2.9000725746154785, - "logits/rejected": -2.9560017585754395, - "logps/chosen": -260.95379638671875, - "logps/rejected": -287.95977783203125, - "loss": 0.4137, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": 0.29968172311782837, - "rewards/margins": 3.1619606018066406, - "rewards/rejected": -2.862278938293457, + "logits/chosen": -2.9114856719970703, + "logits/rejected": -2.9097037315368652, + "logps/chosen": -260.0205993652344, + "logps/rejected": -255.8209228515625, + "loss": 0.2783, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 0.3929978907108307, + "rewards/margins": 5.116772651672363, + "rewards/rejected": -4.723774433135986, "step": 1630 }, { "epoch": 0.85, "learning_rate": 3.9883342895391085e-07, - "logits/chosen": -2.937865734100342, - "logits/rejected": -2.9621806144714355, - "logps/chosen": -268.20404052734375, - "logps/rejected": -301.2460021972656, - "loss": 0.4676, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": 0.061094582080841064, - "rewards/margins": 3.1974027156829834, - "rewards/rejected": -3.136308193206787, + "logits/chosen": -2.910548686981201, + "logits/rejected": -2.9094414710998535, + "logps/chosen": -264.4458923339844, + "logps/rejected": -284.4587707519531, + "loss": 0.3775, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4372238218784332, + "rewards/margins": 4.973487854003906, + "rewards/rejected": -4.536264419555664, "step": 1640 }, { "epoch": 0.85, "learning_rate": 3.9787722317842796e-07, - "logits/chosen": -2.9573769569396973, - "logits/rejected": -3.0099475383758545, - "logps/chosen": -239.37808227539062, - "logps/rejected": -249.80813598632812, - "loss": 0.444, - "rewards/accuracies": 0.8125, - "rewards/chosen": 0.42435941100120544, - "rewards/margins": 3.010646104812622, - "rewards/rejected": -2.586287021636963, + "logits/chosen": -2.8906562328338623, + "logits/rejected": -2.897914171218872, + "logps/chosen": -238.0529022216797, + "logps/rejected": -218.82235717773438, + "loss": 0.2894, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 0.5568763017654419, + "rewards/margins": 5.395173072814941, + "rewards/rejected": -4.838296890258789, "step": 1650 }, { "epoch": 0.86, "learning_rate": 3.969210174029451e-07, - "logits/chosen": -2.910501003265381, - "logits/rejected": -2.9044454097747803, - "logps/chosen": -292.47503662109375, - "logps/rejected": -347.2369384765625, - "loss": 0.4663, - "rewards/accuracies": 0.8125, - "rewards/chosen": 0.30727043747901917, - "rewards/margins": 2.7217769622802734, - "rewards/rejected": -2.414506435394287, + "logits/chosen": -2.856729745864868, + "logits/rejected": -2.844703197479248, + "logps/chosen": -291.847900390625, + "logps/rejected": -305.67083740234375, + "loss": 0.4431, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.37129339575767517, + "rewards/margins": 4.475009918212891, + "rewards/rejected": -4.103716850280762, "step": 1660 }, { "epoch": 0.86, "learning_rate": 3.959648116274622e-07, - "logits/chosen": -2.8908803462982178, - "logits/rejected": -2.9170987606048584, - "logps/chosen": -234.23165893554688, - "logps/rejected": -281.02850341796875, - "loss": 0.5076, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": 0.2927956283092499, - "rewards/margins": 3.0067875385284424, - "rewards/rejected": -2.71399188041687, + "logits/chosen": -2.8652291297912598, + "logits/rejected": -2.881310224533081, + "logps/chosen": -229.58566284179688, + "logps/rejected": -250.58718872070312, + "loss": 0.3771, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.7573938369750977, + "rewards/margins": 5.851351261138916, + "rewards/rejected": -5.093957424163818, "step": 1670 }, { "epoch": 0.87, "learning_rate": 3.950086058519793e-07, - "logits/chosen": -2.9678280353546143, - "logits/rejected": -3.010523796081543, - "logps/chosen": -209.88211059570312, - "logps/rejected": -288.1944274902344, - "loss": 0.5021, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": -0.3828034996986389, - "rewards/margins": 2.6173367500305176, - "rewards/rejected": -3.000140428543091, + "logits/chosen": -2.894726514816284, + "logits/rejected": -2.941082000732422, + "logps/chosen": -202.55252075195312, + "logps/rejected": -254.87265014648438, + "loss": 0.3549, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.3518727719783783, + "rewards/margins": 4.305854320526123, + "rewards/rejected": -3.9539809226989746, "step": 1680 }, { "epoch": 0.87, "learning_rate": 3.9405240007649643e-07, - "logits/chosen": -2.9930944442749023, - "logits/rejected": -3.051088333129883, - "logps/chosen": -218.7351531982422, - "logps/rejected": -301.77264404296875, - "loss": 0.4626, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.22515520453453064, - "rewards/margins": 2.75825834274292, - "rewards/rejected": -2.9834134578704834, + "logits/chosen": -2.9519753456115723, + "logits/rejected": -2.9872474670410156, + "logps/chosen": -217.9387969970703, + "logps/rejected": -274.7255554199219, + "loss": 0.3396, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.03846340253949165, + "rewards/margins": 3.8450846672058105, + "rewards/rejected": -3.8066210746765137, "step": 1690 }, { "epoch": 0.88, "learning_rate": 3.9309619430101354e-07, - "logits/chosen": -2.917217493057251, - "logits/rejected": -2.9686226844787598, - "logps/chosen": -261.61724853515625, - "logps/rejected": -286.07110595703125, - "loss": 0.4765, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.12368907034397125, - "rewards/margins": 2.886246919631958, - "rewards/rejected": -2.7625582218170166, + "logits/chosen": -2.853994131088257, + "logits/rejected": -2.8708691596984863, + "logps/chosen": -259.486328125, + "logps/rejected": -260.5020446777344, + "loss": 0.4142, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 0.37569689750671387, + "rewards/margins": 4.624277591705322, + "rewards/rejected": -4.248580455780029, "step": 1700 }, { "epoch": 0.88, - "eval_logits/chosen": -2.924076557159424, - "eval_logits/rejected": -2.9836485385894775, - "eval_logps/chosen": -243.892578125, - "eval_logps/rejected": -302.3619384765625, - "eval_loss": 0.42963850498199463, - "eval_rewards/accuracies": 0.8059999942779541, - "eval_rewards/chosen": 0.3787572979927063, - "eval_rewards/margins": 3.001209020614624, - "eval_rewards/rejected": -2.6224520206451416, - "eval_runtime": 278.6405, - "eval_samples_per_second": 7.178, - "eval_steps_per_second": 0.449, + "eval_logits/chosen": -2.8457822799682617, + "eval_logits/rejected": -2.883913993835449, + "eval_logps/chosen": -240.87350463867188, + "eval_logps/rejected": -266.1581115722656, + "eval_loss": 0.32409653067588806, + "eval_rewards/accuracies": 0.8500000238418579, + "eval_rewards/chosen": 0.6940996050834656, + "eval_rewards/margins": 4.73652982711792, + "eval_rewards/rejected": -4.042430400848389, + "eval_runtime": 276.0458, + "eval_samples_per_second": 7.245, + "eval_steps_per_second": 0.453, "step": 1700 }, { "epoch": 0.88, "learning_rate": 3.9213998852553066e-07, - "logits/chosen": -2.944732904434204, - "logits/rejected": -2.988416910171509, - "logps/chosen": -319.26422119140625, - "logps/rejected": -304.89996337890625, - "loss": 0.4007, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": 0.012713325209915638, - "rewards/margins": 2.6510703563690186, - "rewards/rejected": -2.638357162475586, + "logits/chosen": -2.883554220199585, + "logits/rejected": -2.87931489944458, + "logps/chosen": -312.7303161621094, + "logps/rejected": -256.54644775390625, + "loss": 0.3359, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6661099791526794, + "rewards/margins": 5.2111616134643555, + "rewards/rejected": -4.545051097869873, "step": 1710 }, { "epoch": 0.89, "learning_rate": 3.9118378275004783e-07, - "logits/chosen": -2.9039697647094727, - "logits/rejected": -2.9630255699157715, - "logps/chosen": -222.86685180664062, - "logps/rejected": -363.20196533203125, - "loss": 0.4875, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": 0.03015981614589691, - "rewards/margins": 2.9296727180480957, - "rewards/rejected": -2.899512767791748, + "logits/chosen": -2.8186769485473633, + "logits/rejected": -2.9040703773498535, + "logps/chosen": -217.84945678710938, + "logps/rejected": -313.4012145996094, + "loss": 0.3481, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 0.5318971872329712, + "rewards/margins": 4.486591815948486, + "rewards/rejected": -3.9546942710876465, "step": 1720 }, { "epoch": 0.89, "learning_rate": 3.9022757697456494e-07, - "logits/chosen": -2.8391387462615967, - "logits/rejected": -2.943779230117798, - "logps/chosen": -302.57977294921875, - "logps/rejected": -337.5728759765625, - "loss": 0.4998, + "logits/chosen": -2.725780487060547, + "logits/rejected": -2.806990146636963, + "logps/chosen": -304.6345520019531, + "logps/rejected": -304.32159423828125, + "loss": 0.4644, "rewards/accuracies": 0.737500011920929, - "rewards/chosen": 0.36434197425842285, - "rewards/margins": 2.9870846271514893, - "rewards/rejected": -2.6227424144744873, + "rewards/chosen": 0.158871591091156, + "rewards/margins": 4.118539810180664, + "rewards/rejected": -3.959667921066284, "step": 1730 }, { "epoch": 0.9, "learning_rate": 3.8927137119908206e-07, - "logits/chosen": -2.919332981109619, - "logits/rejected": -2.9586234092712402, - "logps/chosen": -288.93377685546875, - "logps/rejected": -283.7138977050781, - "loss": 0.4699, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.00860520638525486, - "rewards/margins": 2.815186023712158, - "rewards/rejected": -2.823791742324829, + "logits/chosen": -2.859640598297119, + "logits/rejected": -2.8774573802948, + "logps/chosen": -286.22210693359375, + "logps/rejected": -238.7099609375, + "loss": 0.3978, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.4959928095340729, + "rewards/margins": 4.634981632232666, + "rewards/rejected": -4.138988494873047, "step": 1740 }, { "epoch": 0.9, "learning_rate": 3.883151654235992e-07, - "logits/chosen": -2.885148286819458, - "logits/rejected": -2.940183401107788, - "logps/chosen": -286.89007568359375, - "logps/rejected": -312.8641662597656, - "loss": 0.4729, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.12667697668075562, - "rewards/margins": 2.965704917907715, - "rewards/rejected": -3.0923824310302734, + "logits/chosen": -2.814696788787842, + "logits/rejected": -2.8354735374450684, + "logps/chosen": -283.12353515625, + "logps/rejected": -266.81414794921875, + "loss": 0.3783, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.34605592489242554, + "rewards/margins": 5.262940883636475, + "rewards/rejected": -4.916884422302246, "step": 1750 }, { "epoch": 0.91, "learning_rate": 3.873589596481163e-07, - "logits/chosen": -2.898613929748535, - "logits/rejected": -2.983856439590454, - "logps/chosen": -218.9492645263672, - "logps/rejected": -316.5523986816406, - "loss": 0.5033, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": -0.018303874880075455, - "rewards/margins": 3.3192203044891357, - "rewards/rejected": -3.3375244140625, + "logits/chosen": -2.8274028301239014, + "logits/rejected": -2.8917644023895264, + "logps/chosen": -217.41976928710938, + "logps/rejected": -282.35736083984375, + "loss": 0.3576, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 0.37842267751693726, + "rewards/margins": 5.1851701736450195, + "rewards/rejected": -4.806746959686279, "step": 1760 }, { "epoch": 0.91, "learning_rate": 3.864027538726334e-07, - "logits/chosen": -2.874910593032837, - "logits/rejected": -2.951063394546509, - "logps/chosen": -181.5255584716797, - "logps/rejected": -294.26025390625, - "loss": 0.4505, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": 0.6372448205947876, - "rewards/margins": 3.9570529460906982, - "rewards/rejected": -3.3198082447052, + "logits/chosen": -2.810732364654541, + "logits/rejected": -2.863830327987671, + "logps/chosen": -176.0150909423828, + "logps/rejected": -268.5399169921875, + "loss": 0.3594, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 1.1882919073104858, + "rewards/margins": 5.754234790802002, + "rewards/rejected": -4.565942764282227, "step": 1770 }, { "epoch": 0.92, "learning_rate": 3.8544654809715047e-07, - "logits/chosen": -2.9166131019592285, - "logits/rejected": -2.9520506858825684, - "logps/chosen": -233.5117645263672, - "logps/rejected": -308.47698974609375, - "loss": 0.4564, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": -0.3196285665035248, - "rewards/margins": 2.8492321968078613, - "rewards/rejected": -3.168860673904419, + "logits/chosen": -2.849961280822754, + "logits/rejected": -2.8503620624542236, + "logps/chosen": -227.5367889404297, + "logps/rejected": -266.6046142578125, + "loss": 0.3501, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.27734023332595825, + "rewards/margins": 4.254061698913574, + "rewards/rejected": -3.976722002029419, "step": 1780 }, { "epoch": 0.92, "learning_rate": 3.844903423216676e-07, - "logits/chosen": -2.9044277667999268, - "logits/rejected": -2.9424145221710205, - "logps/chosen": -222.7826690673828, - "logps/rejected": -291.5748596191406, - "loss": 0.4887, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": -0.017815064638853073, - "rewards/margins": 2.7665205001831055, - "rewards/rejected": -2.7843356132507324, + "logits/chosen": -2.808382511138916, + "logits/rejected": -2.801363706588745, + "logps/chosen": -217.94189453125, + "logps/rejected": -265.8270568847656, + "loss": 0.3462, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.46631383895874023, + "rewards/margins": 5.178217887878418, + "rewards/rejected": -4.711904048919678, "step": 1790 }, { "epoch": 0.93, "learning_rate": 3.835341365461847e-07, - "logits/chosen": -2.8614282608032227, - "logits/rejected": -2.894472122192383, - "logps/chosen": -240.697021484375, - "logps/rejected": -293.87872314453125, - "loss": 0.4783, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": -0.362933486700058, - "rewards/margins": 2.6759090423583984, - "rewards/rejected": -3.0388426780700684, + "logits/chosen": -2.771941661834717, + "logits/rejected": -2.7673659324645996, + "logps/chosen": -232.48983764648438, + "logps/rejected": -230.67874145507812, + "loss": 0.3281, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": 0.46895384788513184, + "rewards/margins": 5.733715534210205, + "rewards/rejected": -5.264761924743652, "step": 1800 }, { "epoch": 0.93, - "eval_logits/chosen": -2.886509418487549, - "eval_logits/rejected": -2.953368902206421, - "eval_logps/chosen": -246.73577880859375, - "eval_logps/rejected": -306.0054931640625, - "eval_loss": 0.44217291474342346, - "eval_rewards/accuracies": 0.8040000200271606, - "eval_rewards/chosen": 0.09443826228380203, - "eval_rewards/margins": 3.081247329711914, - "eval_rewards/rejected": -2.986809015274048, - "eval_runtime": 278.8656, - "eval_samples_per_second": 7.172, - "eval_steps_per_second": 0.448, + "eval_logits/chosen": -2.8150792121887207, + "eval_logits/rejected": -2.854769229888916, + "eval_logps/chosen": -240.8274383544922, + "eval_logps/rejected": -269.51165771484375, + "eval_loss": 0.3315921723842621, + "eval_rewards/accuracies": 0.8320000171661377, + "eval_rewards/chosen": 0.6987060904502869, + "eval_rewards/margins": 5.076486110687256, + "eval_rewards/rejected": -4.377779960632324, + "eval_runtime": 276.5442, + "eval_samples_per_second": 7.232, + "eval_steps_per_second": 0.452, "step": 1800 }, { "epoch": 0.93, "learning_rate": 3.825779307707018e-07, - "logits/chosen": -2.8623242378234863, - "logits/rejected": -2.9309682846069336, - "logps/chosen": -133.83680725097656, - "logps/rejected": -273.0003662109375, - "loss": 0.4085, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": 0.12987324595451355, - "rewards/margins": 3.0977888107299805, - "rewards/rejected": -2.9679155349731445, + "logits/chosen": -2.7842040061950684, + "logits/rejected": -2.826920986175537, + "logps/chosen": -130.18197631835938, + "logps/rejected": -253.33642578125, + "loss": 0.2875, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.4951147139072418, + "rewards/margins": 5.295865058898926, + "rewards/rejected": -4.800750255584717, "step": 1810 }, { "epoch": 0.94, "learning_rate": 3.8162172499521893e-07, - "logits/chosen": -2.8795676231384277, - "logits/rejected": -2.9275200366973877, - "logps/chosen": -240.89450073242188, - "logps/rejected": -249.7790985107422, - "loss": 0.5003, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.2536230683326721, - "rewards/margins": 3.140211582183838, - "rewards/rejected": -2.8865885734558105, + "logits/chosen": -2.8218014240264893, + "logits/rejected": -2.8114418983459473, + "logps/chosen": -237.9056854248047, + "logps/rejected": -218.66934204101562, + "loss": 0.3124, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 0.5525059103965759, + "rewards/margins": 5.423817157745361, + "rewards/rejected": -4.871310710906982, "step": 1820 }, { "epoch": 0.94, "learning_rate": 3.8066551921973605e-07, - "logits/chosen": -2.8502426147460938, - "logits/rejected": -2.9374024868011475, - "logps/chosen": -202.7273712158203, - "logps/rejected": -271.82916259765625, - "loss": 0.4376, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": 0.48944076895713806, - "rewards/margins": 2.6945252418518066, - "rewards/rejected": -2.2050845623016357, + "logits/chosen": -2.778608798980713, + "logits/rejected": -2.839911699295044, + "logps/chosen": -208.61685180664062, + "logps/rejected": -267.01953125, + "loss": 0.3657, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.4716556668281555, + "rewards/margins": 5.098639011383057, + "rewards/rejected": -4.626982688903809, "step": 1830 }, { "epoch": 0.95, "learning_rate": 3.7970931344425316e-07, - "logits/chosen": -2.878389358520508, - "logits/rejected": -2.899590015411377, - "logps/chosen": -236.1371307373047, - "logps/rejected": -273.2381591796875, - "loss": 0.345, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.30246537923812866, - "rewards/margins": 2.5328197479248047, - "rewards/rejected": -2.2303545475006104, + "logits/chosen": -2.8139288425445557, + "logits/rejected": -2.786726474761963, + "logps/chosen": -233.8386688232422, + "logps/rejected": -261.02294921875, + "loss": 0.2579, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.5330501794815063, + "rewards/margins": 4.504693508148193, + "rewards/rejected": -3.9716439247131348, "step": 1840 }, { "epoch": 0.96, "learning_rate": 3.787531076687703e-07, - "logits/chosen": -2.7709901332855225, - "logits/rejected": -2.8305325508117676, - "logps/chosen": -241.78689575195312, - "logps/rejected": -254.30398559570312, - "loss": 0.5149, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": -0.11071042716503143, - "rewards/margins": 2.531583070755005, - "rewards/rejected": -2.642293691635132, + "logits/chosen": -2.658464193344116, + "logits/rejected": -2.7157464027404785, + "logps/chosen": -239.3363800048828, + "logps/rejected": -212.64047241210938, + "loss": 0.3301, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.1343434602022171, + "rewards/margins": 4.933978080749512, + "rewards/rejected": -4.7996344566345215, "step": 1850 }, { "epoch": 0.96, "learning_rate": 3.7779690189328745e-07, - "logits/chosen": -2.9073691368103027, - "logits/rejected": -2.941469669342041, - "logps/chosen": -241.056884765625, - "logps/rejected": -272.6620178222656, - "loss": 0.4533, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.14580278098583221, - "rewards/margins": 2.6054463386535645, - "rewards/rejected": -2.751249313354492, + "logits/chosen": -2.796912908554077, + "logits/rejected": -2.755492925643921, + "logps/chosen": -237.4822998046875, + "logps/rejected": -219.21725463867188, + "loss": 0.3422, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.21495482325553894, + "rewards/margins": 4.614625453948975, + "rewards/rejected": -4.399670600891113, "step": 1860 }, { "epoch": 0.97, "learning_rate": 3.7684069611780456e-07, - "logits/chosen": -2.8743133544921875, - "logits/rejected": -2.9183545112609863, - "logps/chosen": -288.95733642578125, - "logps/rejected": -294.15338134765625, - "loss": 0.5509, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": 0.27595505118370056, - "rewards/margins": 2.88635516166687, - "rewards/rejected": -2.6103999614715576, + "logits/chosen": -2.7794861793518066, + "logits/rejected": -2.8035168647766113, + "logps/chosen": -285.71453857421875, + "logps/rejected": -259.53570556640625, + "loss": 0.4352, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.6002339720726013, + "rewards/margins": 5.307577610015869, + "rewards/rejected": -4.707343101501465, "step": 1870 }, { "epoch": 0.97, "learning_rate": 3.758844903423217e-07, - "logits/chosen": -2.9189352989196777, - "logits/rejected": -2.9405598640441895, - "logps/chosen": -203.60948181152344, - "logps/rejected": -290.3697814941406, - "loss": 0.4634, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": 0.3353397250175476, - "rewards/margins": 2.54882550239563, - "rewards/rejected": -2.2134859561920166, + "logits/chosen": -2.83447003364563, + "logits/rejected": -2.842846393585205, + "logps/chosen": -203.15292358398438, + "logps/rejected": -283.0858154296875, + "loss": 0.4156, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.38364094495773315, + "rewards/margins": 4.436356544494629, + "rewards/rejected": -4.05271577835083, "step": 1880 }, { "epoch": 0.98, "learning_rate": 3.749282845668388e-07, - "logits/chosen": -2.8900084495544434, - "logits/rejected": -2.9685158729553223, - "logps/chosen": -245.86343383789062, - "logps/rejected": -315.4371643066406, - "loss": 0.4088, - "rewards/accuracies": 0.8125, - "rewards/chosen": 0.39556270837783813, - "rewards/margins": 2.9713571071624756, - "rewards/rejected": -2.575794219970703, + "logits/chosen": -2.8184475898742676, + "logits/rejected": -2.877253770828247, + "logps/chosen": -245.83517456054688, + "logps/rejected": -283.7890625, + "loss": 0.3182, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 0.6044051051139832, + "rewards/margins": 4.82285213470459, + "rewards/rejected": -4.218446731567383, "step": 1890 }, { "epoch": 0.98, "learning_rate": 3.739720787913559e-07, - "logits/chosen": -2.884178400039673, - "logits/rejected": -2.942009687423706, - "logps/chosen": -269.1268310546875, - "logps/rejected": -278.32073974609375, - "loss": 0.465, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": 0.03307962417602539, - "rewards/margins": 2.16691255569458, - "rewards/rejected": -2.1338324546813965, + "logits/chosen": -2.8063135147094727, + "logits/rejected": -2.8252980709075928, + "logps/chosen": -267.6496276855469, + "logps/rejected": -243.62979125976562, + "loss": 0.3652, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 0.18056640028953552, + "rewards/margins": 3.934478282928467, + "rewards/rejected": -3.7539114952087402, "step": 1900 }, { "epoch": 0.98, - "eval_logits/chosen": -2.871323585510254, - "eval_logits/rejected": -2.9355289936065674, - "eval_logps/chosen": -242.65213012695312, - "eval_logps/rejected": -299.46307373046875, - "eval_loss": 0.4434332847595215, - "eval_rewards/accuracies": 0.7960000038146973, - "eval_rewards/chosen": 0.5028029680252075, - "eval_rewards/margins": 2.835366725921631, - "eval_rewards/rejected": -2.3325634002685547, - "eval_runtime": 278.765, - "eval_samples_per_second": 7.175, - "eval_steps_per_second": 0.448, + "eval_logits/chosen": -2.8050785064697266, + "eval_logits/rejected": -2.8455307483673096, + "eval_logps/chosen": -240.4807586669922, + "eval_logps/rejected": -267.376220703125, + "eval_loss": 0.3273419141769409, + "eval_rewards/accuracies": 0.8259999752044678, + "eval_rewards/chosen": 0.7333716750144958, + "eval_rewards/margins": 4.897610187530518, + "eval_rewards/rejected": -4.164239406585693, + "eval_runtime": 276.4305, + "eval_samples_per_second": 7.235, + "eval_steps_per_second": 0.452, "step": 1900 }, { "epoch": 0.99, "learning_rate": 3.73015873015873e-07, - "logits/chosen": -2.8448338508605957, - "logits/rejected": -2.9431285858154297, - "logps/chosen": -256.785400390625, - "logps/rejected": -319.0350036621094, - "loss": 0.423, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": 0.6242324709892273, - "rewards/margins": 2.828500270843506, - "rewards/rejected": -2.204267978668213, + "logits/chosen": -2.768927574157715, + "logits/rejected": -2.8498053550720215, + "logps/chosen": -253.6636199951172, + "logps/rejected": -293.38055419921875, + "loss": 0.375, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 1.208627462387085, + "rewards/margins": 4.716521263122559, + "rewards/rejected": -3.5078938007354736, "step": 1910 }, { "epoch": 0.99, "learning_rate": 3.7205966724039014e-07, - "logits/chosen": -2.7674853801727295, - "logits/rejected": -2.814272403717041, - "logps/chosen": -245.8392791748047, - "logps/rejected": -304.9154968261719, - "loss": 0.4506, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.4677479863166809, - "rewards/margins": 2.518876075744629, - "rewards/rejected": -2.0511279106140137, + "logits/chosen": -2.694674253463745, + "logits/rejected": -2.7329392433166504, + "logps/chosen": -242.2804412841797, + "logps/rejected": -289.232421875, + "loss": 0.3075, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 0.8236311674118042, + "rewards/margins": 5.207705497741699, + "rewards/rejected": -4.384073734283447, "step": 1920 }, { "epoch": 1.0, "learning_rate": 3.711034614649072e-07, - "logits/chosen": -2.857226848602295, - "logits/rejected": -2.9040684700012207, - "logps/chosen": -250.416259765625, - "logps/rejected": -296.86212158203125, - "loss": 0.4355, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": 0.7089163064956665, - "rewards/margins": 3.51921010017395, - "rewards/rejected": -2.810293436050415, + "logits/chosen": -2.801443099975586, + "logits/rejected": -2.817596912384033, + "logps/chosen": -247.8707275390625, + "logps/rejected": -279.2347106933594, + "loss": 0.391, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.9620423316955566, + "rewards/margins": 5.517745018005371, + "rewards/rejected": -4.555703163146973, "step": 1930 }, { "epoch": 1.0, "learning_rate": 3.701472556894243e-07, - "logits/chosen": -2.783198833465576, - "logits/rejected": -2.8520195484161377, - "logps/chosen": -234.9251251220703, - "logps/rejected": -343.29779052734375, - "loss": 0.3792, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": 0.32631635665893555, - "rewards/margins": 3.4934210777282715, - "rewards/rejected": -3.167104721069336, + "logits/chosen": -2.7240657806396484, + "logits/rejected": -2.7411351203918457, + "logps/chosen": -233.58255004882812, + "logps/rejected": -296.0520935058594, + "loss": 0.3419, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": 0.631440281867981, + "rewards/margins": 5.470465660095215, + "rewards/rejected": -4.83902645111084, "step": 1940 }, { "epoch": 1.01, "learning_rate": 3.6919104991394144e-07, - "logits/chosen": -2.869868755340576, - "logits/rejected": -2.9353652000427246, - "logps/chosen": -200.75564575195312, - "logps/rejected": -300.0337829589844, - "loss": 0.1056, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.424803376197815, - "rewards/margins": 5.865682601928711, - "rewards/rejected": -4.4408793449401855, + "logits/chosen": -2.8222758769989014, + "logits/rejected": -2.8598618507385254, + "logps/chosen": -196.82986450195312, + "logps/rejected": -249.7384796142578, + "loss": 0.0756, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.7974284887313843, + "rewards/margins": 7.206450462341309, + "rewards/rejected": -5.409022331237793, "step": 1950 }, { "epoch": 1.01, "learning_rate": 3.6823484413845855e-07, - "logits/chosen": -2.823423385620117, - "logits/rejected": -2.85945463180542, - "logps/chosen": -277.2393798828125, - "logps/rejected": -334.59613037109375, - "loss": 0.105, + "logits/chosen": -2.7665610313415527, + "logits/rejected": -2.774704694747925, + "logps/chosen": -274.900634765625, + "logps/rejected": -294.66754150390625, + "loss": 0.106, "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 1.2674754858016968, - "rewards/margins": 6.0102996826171875, - "rewards/rejected": -4.742823600769043, + "rewards/chosen": 1.501900315284729, + "rewards/margins": 7.5810699462890625, + "rewards/rejected": -6.079169750213623, "step": 1960 }, { "epoch": 1.02, "learning_rate": 3.6727863836297567e-07, - "logits/chosen": -2.8262939453125, - "logits/rejected": -2.855597972869873, - "logps/chosen": -255.54296875, - "logps/rejected": -308.5851135253906, - "loss": 0.1366, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.3961846828460693, - "rewards/margins": 6.024592399597168, - "rewards/rejected": -4.628407001495361, + "logits/chosen": -2.7974960803985596, + "logits/rejected": -2.8243980407714844, + "logps/chosen": -254.9295654296875, + "logps/rejected": -285.19512939453125, + "loss": 0.0836, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.8323204517364502, + "rewards/margins": 7.624319553375244, + "rewards/rejected": -5.791998863220215, "step": 1970 }, { "epoch": 1.02, "learning_rate": 3.663224325874928e-07, - "logits/chosen": -2.7664477825164795, - "logits/rejected": -2.8171486854553223, - "logps/chosen": -189.17161560058594, - "logps/rejected": -328.0600280761719, - "loss": 0.0893, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.5123683214187622, - "rewards/margins": 6.919796943664551, - "rewards/rejected": -5.407429218292236, + "logits/chosen": -2.769134998321533, + "logits/rejected": -2.775172710418701, + "logps/chosen": -187.74087524414062, + "logps/rejected": -283.3396911621094, + "loss": 0.0512, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.6562093496322632, + "rewards/margins": 8.434819221496582, + "rewards/rejected": -6.7786102294921875, "step": 1980 }, { "epoch": 1.03, "learning_rate": 3.653662268120099e-07, - "logits/chosen": -2.76231050491333, - "logits/rejected": -2.8225061893463135, - "logps/chosen": -216.73330688476562, - "logps/rejected": -362.13922119140625, - "loss": 0.0676, + "logits/chosen": -2.743010997772217, + "logits/rejected": -2.80153751373291, + "logps/chosen": -217.8404083251953, + "logps/rejected": -331.8171081542969, + "loss": 0.0415, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.8166065216064453, - "rewards/margins": 6.026690483093262, - "rewards/rejected": -4.210083961486816, + "rewards/chosen": 2.2021520137786865, + "rewards/margins": 7.949954986572266, + "rewards/rejected": -5.747802257537842, "step": 1990 }, { "epoch": 1.03, "learning_rate": 3.6441002103652707e-07, - "logits/chosen": -2.784984588623047, - "logits/rejected": -2.8118598461151123, - "logps/chosen": -235.13330078125, - "logps/rejected": -352.83721923828125, - "loss": 0.0921, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.2493923902511597, - "rewards/margins": 6.322751045227051, - "rewards/rejected": -5.073358535766602, + "logits/chosen": -2.7709929943084717, + "logits/rejected": -2.7828145027160645, + "logps/chosen": -227.2114715576172, + "logps/rejected": -352.28521728515625, + "loss": 0.0329, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.041569948196411, + "rewards/margins": 8.33797836303711, + "rewards/rejected": -6.296408176422119, "step": 2000 }, { "epoch": 1.03, - "eval_logits/chosen": -2.785792589187622, - "eval_logits/rejected": -2.8519487380981445, - "eval_logps/chosen": -246.1128387451172, - "eval_logps/rejected": -310.61309814453125, - "eval_loss": 0.4446970224380493, - "eval_rewards/accuracies": 0.8119999766349792, - "eval_rewards/chosen": 0.15673115849494934, - "eval_rewards/margins": 3.604296922683716, - "eval_rewards/rejected": -3.4475655555725098, - "eval_runtime": 278.4785, - "eval_samples_per_second": 7.182, - "eval_steps_per_second": 0.449, + "eval_logits/chosen": -2.7748961448669434, + "eval_logits/rejected": -2.812722682952881, + "eval_logps/chosen": -240.4134979248047, + "eval_logps/rejected": -274.0543212890625, + "eval_loss": 0.3242701292037964, + "eval_rewards/accuracies": 0.8360000252723694, + "eval_rewards/chosen": 0.7400988936424255, + "eval_rewards/margins": 5.572147369384766, + "eval_rewards/rejected": -4.832048416137695, + "eval_runtime": 276.5288, + "eval_samples_per_second": 7.233, + "eval_steps_per_second": 0.452, "step": 2000 }, { "epoch": 1.04, "learning_rate": 3.634538152610442e-07, - "logits/chosen": -2.8091163635253906, - "logits/rejected": -2.867849826812744, - "logps/chosen": -263.2342224121094, - "logps/rejected": -345.95953369140625, - "loss": 0.0674, + "logits/chosen": -2.7888762950897217, + "logits/rejected": -2.8140132427215576, + "logps/chosen": -264.2323913574219, + "logps/rejected": -307.06207275390625, + "loss": 0.0456, "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.6022307872772217, - "rewards/margins": 6.758018493652344, - "rewards/rejected": -5.155787467956543, + "rewards/chosen": 1.5025798082351685, + "rewards/margins": 8.503244400024414, + "rewards/rejected": -7.000663757324219, "step": 2010 }, { "epoch": 1.04, "learning_rate": 3.624976094855613e-07, - "logits/chosen": -2.7523140907287598, - "logits/rejected": -2.8147144317626953, - "logps/chosen": -248.58163452148438, - "logps/rejected": -312.04058837890625, - "loss": 0.0759, + "logits/chosen": -2.7641804218292236, + "logits/rejected": -2.799376964569092, + "logps/chosen": -246.8909912109375, + "logps/rejected": -288.9852600097656, + "loss": 0.1283, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.417191743850708, - "rewards/margins": 6.807399749755859, - "rewards/rejected": -5.3902082443237305, + "rewards/chosen": 1.5857131481170654, + "rewards/margins": 8.588592529296875, + "rewards/rejected": -7.0028791427612305, "step": 2020 }, { "epoch": 1.05, "learning_rate": 3.615414037100784e-07, - "logits/chosen": -2.6939048767089844, - "logits/rejected": -2.765730142593384, - "logps/chosen": -207.52597045898438, - "logps/rejected": -296.8516540527344, - "loss": 0.1055, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.1127474308013916, - "rewards/margins": 6.433894157409668, - "rewards/rejected": -5.3211469650268555, + "logits/chosen": -2.7224791049957275, + "logits/rejected": -2.7492153644561768, + "logps/chosen": -201.63681030273438, + "logps/rejected": -249.47348022460938, + "loss": 0.0488, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.7034696340560913, + "rewards/margins": 8.281048774719238, + "rewards/rejected": -6.577579498291016, "step": 2030 }, { "epoch": 1.05, "learning_rate": 3.6058519793459553e-07, - "logits/chosen": -2.743504047393799, - "logits/rejected": -2.8185174465179443, - "logps/chosen": -245.7578125, - "logps/rejected": -285.220703125, - "loss": 0.0985, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 2.0196592807769775, - "rewards/margins": 6.537779331207275, - "rewards/rejected": -4.518120288848877, + "logits/chosen": -2.7428953647613525, + "logits/rejected": -2.770498037338257, + "logps/chosen": -248.51657104492188, + "logps/rejected": -253.31396484375, + "loss": 0.0533, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.7437824010849, + "rewards/margins": 7.630038261413574, + "rewards/rejected": -5.886256217956543, "step": 2040 }, { "epoch": 1.06, "learning_rate": 3.5962899215911265e-07, - "logits/chosen": -2.7714102268218994, - "logits/rejected": -2.8159098625183105, - "logps/chosen": -169.52871704101562, - "logps/rejected": -333.046630859375, - "loss": 0.0756, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.3516714572906494, - "rewards/margins": 5.515862941741943, - "rewards/rejected": -4.164191246032715, + "logits/chosen": -2.7590394020080566, + "logits/rejected": -2.805523633956909, + "logps/chosen": -165.6067352294922, + "logps/rejected": -318.6546325683594, + "loss": 0.0564, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.7025703191757202, + "rewards/margins": 7.262213230133057, + "rewards/rejected": -5.559643268585205, "step": 2050 }, { "epoch": 1.06, "learning_rate": 3.5867278638362976e-07, - "logits/chosen": -2.7787039279937744, - "logits/rejected": -2.8366940021514893, - "logps/chosen": -302.2014465332031, - "logps/rejected": -352.3808898925781, - "loss": 0.0629, + "logits/chosen": -2.7824246883392334, + "logits/rejected": -2.819293260574341, + "logps/chosen": -303.74835205078125, + "logps/rejected": -257.7971496582031, + "loss": 0.0852, "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.691415786743164, - "rewards/margins": 7.102464199066162, - "rewards/rejected": -5.411048889160156, + "rewards/chosen": 1.5363743305206299, + "rewards/margins": 8.307432174682617, + "rewards/rejected": -6.771058082580566, "step": 2060 }, { "epoch": 1.07, "learning_rate": 3.577165806081469e-07, - "logits/chosen": -2.8112542629241943, - "logits/rejected": -2.8483777046203613, - "logps/chosen": -250.97900390625, - "logps/rejected": -293.00360107421875, - "loss": 0.097, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.0092226266860962, - "rewards/margins": 5.730241298675537, - "rewards/rejected": -4.7210187911987305, + "logits/chosen": -2.820133924484253, + "logits/rejected": -2.8261427879333496, + "logps/chosen": -250.80490112304688, + "logps/rejected": -262.58880615234375, + "loss": 0.0693, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.0266307592391968, + "rewards/margins": 7.17365026473999, + "rewards/rejected": -6.147019863128662, "step": 2070 }, { "epoch": 1.07, "learning_rate": 3.56760374832664e-07, - "logits/chosen": -2.828165292739868, - "logits/rejected": -2.830998420715332, - "logps/chosen": -285.7459411621094, - "logps/rejected": -356.6951599121094, - "loss": 0.0637, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.7443921566009521, - "rewards/margins": 7.199429988861084, - "rewards/rejected": -5.455037593841553, + "logits/chosen": -2.826376438140869, + "logits/rejected": -2.769313335418701, + "logps/chosen": -281.1196594238281, + "logps/rejected": -289.2098083496094, + "loss": 0.0464, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2080843448638916, + "rewards/margins": 8.848880767822266, + "rewards/rejected": -6.640796661376953, "step": 2080 }, { "epoch": 1.08, "learning_rate": 3.5580416905718106e-07, - "logits/chosen": -2.7509326934814453, - "logits/rejected": -2.816725492477417, - "logps/chosen": -279.53314208984375, - "logps/rejected": -309.99639892578125, - "loss": 0.0915, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.6722646951675415, - "rewards/margins": 7.914118766784668, - "rewards/rejected": -6.241853713989258, + "logits/chosen": -2.757836103439331, + "logits/rejected": -2.7929940223693848, + "logps/chosen": -280.5599670410156, + "logps/rejected": -275.11138916015625, + "loss": 0.0807, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.7797348499298096, + "rewards/margins": 8.384897232055664, + "rewards/rejected": -6.605162143707275, "step": 2090 }, { "epoch": 1.08, "learning_rate": 3.5484796328169817e-07, - "logits/chosen": -2.7287991046905518, - "logits/rejected": -2.78497052192688, - "logps/chosen": -244.94091796875, - "logps/rejected": -279.9932861328125, - "loss": 0.0776, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.1437629461288452, - "rewards/margins": 7.133315086364746, - "rewards/rejected": -5.9895524978637695, + "logits/chosen": -2.7502427101135254, + "logits/rejected": -2.736806631088257, + "logps/chosen": -243.5753631591797, + "logps/rejected": -229.602783203125, + "loss": 0.0473, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2803195714950562, + "rewards/margins": 7.868816375732422, + "rewards/rejected": -6.588496208190918, "step": 2100 }, { "epoch": 1.08, - "eval_logits/chosen": -2.7763452529907227, - "eval_logits/rejected": -2.841165542602539, - "eval_logps/chosen": -246.7716522216797, - "eval_logps/rejected": -315.55926513671875, - "eval_loss": 0.47759073972702026, - "eval_rewards/accuracies": 0.8140000104904175, - "eval_rewards/chosen": 0.09085023403167725, - "eval_rewards/margins": 4.033032417297363, - "eval_rewards/rejected": -3.9421823024749756, - "eval_runtime": 278.7756, - "eval_samples_per_second": 7.174, - "eval_steps_per_second": 0.448, + "eval_logits/chosen": -2.775683641433716, + "eval_logits/rejected": -2.8128180503845215, + "eval_logps/chosen": -241.82281494140625, + "eval_logps/rejected": -278.80523681640625, + "eval_loss": 0.3209003210067749, + "eval_rewards/accuracies": 0.843999981880188, + "eval_rewards/chosen": 0.599168598651886, + "eval_rewards/margins": 5.906309127807617, + "eval_rewards/rejected": -5.307140827178955, + "eval_runtime": 276.2922, + "eval_samples_per_second": 7.239, + "eval_steps_per_second": 0.452, "step": 2100 }, { "epoch": 1.09, "learning_rate": 3.538917575062153e-07, - "logits/chosen": -2.7345612049102783, - "logits/rejected": -2.7558319568634033, - "logps/chosen": -287.73345947265625, - "logps/rejected": -363.9012145996094, - "loss": 0.0926, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.3527368307113647, - "rewards/margins": 7.984021186828613, - "rewards/rejected": -6.631285190582275, + "logits/chosen": -2.714613199234009, + "logits/rejected": -2.717787504196167, + "logps/chosen": -281.52374267578125, + "logps/rejected": -318.97271728515625, + "loss": 0.1054, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.9737049341201782, + "rewards/margins": 9.622587203979492, + "rewards/rejected": -7.648881435394287, "step": 2110 }, { "epoch": 1.09, "learning_rate": 3.529355517307324e-07, - "logits/chosen": -2.761762857437134, - "logits/rejected": -2.779594898223877, - "logps/chosen": -236.8489990234375, - "logps/rejected": -340.5326843261719, - "loss": 0.052, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.2858283519744873, - "rewards/margins": 7.696159362792969, - "rewards/rejected": -6.410330772399902, + "logits/chosen": -2.7420895099639893, + "logits/rejected": -2.729947566986084, + "logps/chosen": -236.37734985351562, + "logps/rejected": -297.0089111328125, + "loss": 0.0564, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.3335802555084229, + "rewards/margins": 8.861669540405273, + "rewards/rejected": -7.5280890464782715, "step": 2120 }, { "epoch": 1.1, "learning_rate": 3.519793459552495e-07, - "logits/chosen": -2.776559352874756, - "logits/rejected": -2.8439061641693115, - "logps/chosen": -260.61431884765625, - "logps/rejected": -348.29681396484375, - "loss": 0.1065, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 0.6920270919799805, - "rewards/margins": 6.389278888702393, - "rewards/rejected": -5.697251796722412, + "logits/chosen": -2.762181043624878, + "logits/rejected": -2.8062615394592285, + "logps/chosen": -258.14434814453125, + "logps/rejected": -317.340576171875, + "loss": 0.1917, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.9390251040458679, + "rewards/margins": 7.926877498626709, + "rewards/rejected": -6.987851619720459, "step": 2130 }, { "epoch": 1.1, "learning_rate": 3.510231401797667e-07, - "logits/chosen": -2.8243560791015625, - "logits/rejected": -2.829003095626831, - "logps/chosen": -294.7447204589844, - "logps/rejected": -368.647216796875, - "loss": 0.0906, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.7448446750640869, - "rewards/margins": 8.040909767150879, - "rewards/rejected": -7.296065330505371, + "logits/chosen": -2.8115668296813965, + "logits/rejected": -2.783649444580078, + "logps/chosen": -287.54443359375, + "logps/rejected": -336.9176025390625, + "loss": 0.1123, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.4648735523223877, + "rewards/margins": 10.212109565734863, + "rewards/rejected": -8.747235298156738, "step": 2140 }, { "epoch": 1.11, "learning_rate": 3.500669344042838e-07, - "logits/chosen": -2.7932868003845215, - "logits/rejected": -2.8034961223602295, - "logps/chosen": -227.6187744140625, - "logps/rejected": -343.4903869628906, - "loss": 0.0865, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.5658053159713745, - "rewards/margins": 7.489978790283203, - "rewards/rejected": -6.924172878265381, + "logits/chosen": -2.784982681274414, + "logits/rejected": -2.7810211181640625, + "logps/chosen": -219.8955841064453, + "logps/rejected": -290.2805480957031, + "loss": 0.0645, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.3381234407424927, + "rewards/margins": 8.486083984375, + "rewards/rejected": -7.147960662841797, "step": 2150 }, { "epoch": 1.12, "learning_rate": 3.491107286288009e-07, - "logits/chosen": -2.7819812297821045, - "logits/rejected": -2.839261531829834, - "logps/chosen": -238.858154296875, - "logps/rejected": -349.450927734375, - "loss": 0.0682, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.2682182788848877, - "rewards/margins": 7.554483890533447, - "rewards/rejected": -6.286264896392822, + "logits/chosen": -2.765918731689453, + "logits/rejected": -2.8265442848205566, + "logps/chosen": -237.5640411376953, + "logps/rejected": -286.0890808105469, + "loss": 0.08, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.398202896118164, + "rewards/margins": 8.822298049926758, + "rewards/rejected": -7.424094200134277, "step": 2160 }, { "epoch": 1.12, "learning_rate": 3.4815452285331803e-07, - "logits/chosen": -2.733638048171997, - "logits/rejected": -2.7781291007995605, - "logps/chosen": -301.14105224609375, - "logps/rejected": -330.53912353515625, - "loss": 0.0731, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.8851715326309204, - "rewards/margins": 6.45550537109375, - "rewards/rejected": -5.570334434509277, + "logits/chosen": -2.6913180351257324, + "logits/rejected": -2.7103495597839355, + "logps/chosen": -299.0403747558594, + "logps/rejected": -304.22344970703125, + "loss": 0.1165, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.0295835733413696, + "rewards/margins": 8.502135276794434, + "rewards/rejected": -7.472552299499512, "step": 2170 }, { "epoch": 1.13, "learning_rate": 3.4719831707783515e-07, - "logits/chosen": -2.7247109413146973, - "logits/rejected": -2.740576982498169, - "logps/chosen": -304.32928466796875, - "logps/rejected": -386.80609130859375, - "loss": 0.0775, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.8642752766609192, - "rewards/margins": 7.015794277191162, - "rewards/rejected": -6.151518821716309, + "logits/chosen": -2.7278690338134766, + "logits/rejected": -2.7369189262390137, + "logps/chosen": -307.92047119140625, + "logps/rejected": -321.743408203125, + "loss": 0.0638, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.9634364247322083, + "rewards/margins": 8.218239784240723, + "rewards/rejected": -7.254802703857422, "step": 2180 }, { "epoch": 1.13, "learning_rate": 3.4624211130235227e-07, - "logits/chosen": -2.798527956008911, - "logits/rejected": -2.8238823413848877, - "logps/chosen": -205.30416870117188, - "logps/rejected": -305.39337158203125, - "loss": 0.0775, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.6926091909408569, - "rewards/margins": 7.436480522155762, - "rewards/rejected": -6.743871212005615, + "logits/chosen": -2.826801300048828, + "logits/rejected": -2.8246359825134277, + "logps/chosen": -200.2306671142578, + "logps/rejected": -263.4684753417969, + "loss": 0.0517, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2014269828796387, + "rewards/margins": 8.948549270629883, + "rewards/rejected": -7.747122287750244, "step": 2190 }, { "epoch": 1.14, "learning_rate": 3.452859055268694e-07, - "logits/chosen": -2.7647600173950195, - "logits/rejected": -2.799379348754883, - "logps/chosen": -225.5302276611328, - "logps/rejected": -308.9101867675781, - "loss": 0.0679, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.8118773698806763, - "rewards/margins": 7.733236789703369, - "rewards/rejected": -6.921359062194824, + "logits/chosen": -2.806624412536621, + "logits/rejected": -2.8418326377868652, + "logps/chosen": -213.3162841796875, + "logps/rejected": -280.90850830078125, + "loss": 0.0519, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.0332722663879395, + "rewards/margins": 9.945501327514648, + "rewards/rejected": -7.912228584289551, "step": 2200 }, { "epoch": 1.14, - "eval_logits/chosen": -2.744640588760376, - "eval_logits/rejected": -2.808490037918091, - "eval_logps/chosen": -254.41099548339844, - "eval_logps/rejected": -324.3449401855469, - "eval_loss": 0.4769650995731354, - "eval_rewards/accuracies": 0.8240000009536743, - "eval_rewards/chosen": -0.6730862855911255, - "eval_rewards/margins": 4.147665977478027, - "eval_rewards/rejected": -4.820752143859863, - "eval_runtime": 278.654, - "eval_samples_per_second": 7.177, - "eval_steps_per_second": 0.449, + "eval_logits/chosen": -2.816742181777954, + "eval_logits/rejected": -2.8572001457214355, + "eval_logps/chosen": -243.08526611328125, + "eval_logps/rejected": -286.4280090332031, + "eval_loss": 0.3360002338886261, + "eval_rewards/accuracies": 0.8539999723434448, + "eval_rewards/chosen": 0.47292324900627136, + "eval_rewards/margins": 6.542343616485596, + "eval_rewards/rejected": -6.06942081451416, + "eval_runtime": 276.0273, + "eval_samples_per_second": 7.246, + "eval_steps_per_second": 0.453, "step": 2200 }, { "epoch": 1.14, "learning_rate": 3.443296997513865e-07, - "logits/chosen": -2.804619550704956, - "logits/rejected": -2.8267204761505127, - "logps/chosen": -268.3411560058594, - "logps/rejected": -381.73199462890625, - "loss": 0.0954, + "logits/chosen": -2.862398862838745, + "logits/rejected": -2.8665356636047363, + "logps/chosen": -253.4799041748047, + "logps/rejected": -350.1310119628906, + "loss": 0.0641, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.9453462362289429, - "rewards/margins": 7.497371673583984, - "rewards/rejected": -6.55202579498291, + "rewards/chosen": 2.4307162761688232, + "rewards/margins": 9.793815612792969, + "rewards/rejected": -7.363099098205566, "step": 2210 }, { "epoch": 1.15, "learning_rate": 3.433734939759036e-07, - "logits/chosen": -2.7686784267425537, - "logits/rejected": -2.811330556869507, - "logps/chosen": -259.18743896484375, - "logps/rejected": -403.6307373046875, - "loss": 0.0815, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.2846524715423584, - "rewards/margins": 9.648810386657715, - "rewards/rejected": -8.36415958404541, + "logits/chosen": -2.816568613052368, + "logits/rejected": -2.8477704524993896, + "logps/chosen": -249.11514282226562, + "logps/rejected": -360.58209228515625, + "loss": 0.0455, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.2918827533721924, + "rewards/margins": 10.806341171264648, + "rewards/rejected": -8.514456748962402, "step": 2220 }, { "epoch": 1.15, "learning_rate": 3.4241728820042073e-07, - "logits/chosen": -2.731107234954834, - "logits/rejected": -2.77121639251709, - "logps/chosen": -223.3089599609375, - "logps/rejected": -357.7713928222656, - "loss": 0.0832, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.13498951494693756, - "rewards/margins": 6.8718414306640625, - "rewards/rejected": -6.736852169036865, + "logits/chosen": -2.7673516273498535, + "logits/rejected": -2.784446954727173, + "logps/chosen": -219.42697143554688, + "logps/rejected": -302.0328369140625, + "loss": 0.0695, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.5231890082359314, + "rewards/margins": 8.274696350097656, + "rewards/rejected": -7.751506805419922, "step": 2230 }, { "epoch": 1.16, "learning_rate": 3.4146108242493784e-07, - "logits/chosen": -2.811983823776245, - "logits/rejected": -2.864361524581909, - "logps/chosen": -215.65737915039062, - "logps/rejected": -333.7270812988281, - "loss": 0.0782, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.2699239253997803, - "rewards/margins": 7.710413932800293, - "rewards/rejected": -6.440489768981934, + "logits/chosen": -2.857614278793335, + "logits/rejected": -2.894073009490967, + "logps/chosen": -215.42770385742188, + "logps/rejected": -303.6365966796875, + "loss": 0.0516, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.7477014064788818, + "rewards/margins": 9.682977676391602, + "rewards/rejected": -7.935276031494141, "step": 2240 }, { "epoch": 1.16, "learning_rate": 3.405048766494549e-07, - "logits/chosen": -2.812655210494995, - "logits/rejected": -2.8382606506347656, - "logps/chosen": -280.080078125, - "logps/rejected": -329.3871154785156, - "loss": 0.0863, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.7157481908798218, - "rewards/margins": 7.961747646331787, - "rewards/rejected": -6.245999336242676, + "logits/chosen": -2.848982810974121, + "logits/rejected": -2.868595600128174, + "logps/chosen": -275.85260009765625, + "logps/rejected": -288.1164855957031, + "loss": 0.0823, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.1387085914611816, + "rewards/margins": 9.777128219604492, + "rewards/rejected": -7.638421058654785, "step": 2250 }, { "epoch": 1.17, "learning_rate": 3.39548670873972e-07, - "logits/chosen": -2.803392171859741, - "logits/rejected": -2.8344738483428955, - "logps/chosen": -296.22540283203125, - "logps/rejected": -401.93194580078125, - "loss": 0.1221, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 2.1003711223602295, - "rewards/margins": 9.793557167053223, - "rewards/rejected": -7.693185329437256, + "logits/chosen": -2.8419032096862793, + "logits/rejected": -2.8721580505371094, + "logps/chosen": -297.6785583496094, + "logps/rejected": -322.30316162109375, + "loss": 0.0536, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.9545199871063232, + "rewards/margins": 11.350884437561035, + "rewards/rejected": -9.396364212036133, "step": 2260 }, { "epoch": 1.17, "learning_rate": 3.3859246509848914e-07, - "logits/chosen": -2.7670085430145264, - "logits/rejected": -2.8575000762939453, - "logps/chosen": -259.09197998046875, - "logps/rejected": -358.9950256347656, - "loss": 0.0765, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.5088821649551392, - "rewards/margins": 7.601584434509277, - "rewards/rejected": -6.0927019119262695, + "logits/chosen": -2.8102831840515137, + "logits/rejected": -2.87699818611145, + "logps/chosen": -249.8861846923828, + "logps/rejected": -331.9376220703125, + "loss": 0.0585, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.429464817047119, + "rewards/margins": 10.181089401245117, + "rewards/rejected": -7.75162410736084, "step": 2270 }, { "epoch": 1.18, "learning_rate": 3.376362593230063e-07, - "logits/chosen": -2.8007359504699707, - "logits/rejected": -2.861616611480713, - "logps/chosen": -227.0973663330078, - "logps/rejected": -363.10223388671875, - "loss": 0.076, + "logits/chosen": -2.819567918777466, + "logits/rejected": -2.8646624088287354, + "logps/chosen": -224.81201171875, + "logps/rejected": -310.34332275390625, + "loss": 0.0634, "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.5983880758285522, - "rewards/margins": 9.013298034667969, - "rewards/rejected": -7.414910316467285, + "rewards/chosen": 1.7732893228530884, + "rewards/margins": 9.98145866394043, + "rewards/rejected": -8.208169937133789, "step": 2280 }, { "epoch": 1.18, "learning_rate": 3.366800535475234e-07, - "logits/chosen": -2.807025909423828, - "logits/rejected": -2.799335241317749, - "logps/chosen": -268.4599914550781, - "logps/rejected": -343.4675598144531, - "loss": 0.0852, + "logits/chosen": -2.7958028316497803, + "logits/rejected": -2.803018808364868, + "logps/chosen": -263.45465087890625, + "logps/rejected": -299.6593933105469, + "loss": 0.0483, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.4412739276885986, - "rewards/margins": 7.5395050048828125, - "rewards/rejected": -6.098231315612793, + "rewards/chosen": 1.9391990900039673, + "rewards/margins": 9.844877243041992, + "rewards/rejected": -7.905677795410156, "step": 2290 }, { "epoch": 1.19, "learning_rate": 3.3572384777204054e-07, - "logits/chosen": -2.8694748878479004, - "logits/rejected": -2.8965165615081787, - "logps/chosen": -247.4459686279297, - "logps/rejected": -319.42218017578125, - "loss": 0.0696, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.2710659503936768, - "rewards/margins": 7.4833526611328125, - "rewards/rejected": -6.212286949157715, + "logits/chosen": -2.8425309658050537, + "logits/rejected": -2.823854923248291, + "logps/chosen": -248.36083984375, + "logps/rejected": -282.83660888671875, + "loss": 0.0637, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.4616529941558838, + "rewards/margins": 10.066879272460938, + "rewards/rejected": -8.605226516723633, "step": 2300 }, { "epoch": 1.19, - "eval_logits/chosen": -2.8013522624969482, - "eval_logits/rejected": -2.8622031211853027, - "eval_logps/chosen": -247.92803955078125, - "eval_logps/rejected": -317.9334411621094, - "eval_loss": 0.4886242747306824, - "eval_rewards/accuracies": 0.8159999847412109, - "eval_rewards/chosen": -0.024788517504930496, - "eval_rewards/margins": 4.154811382293701, - "eval_rewards/rejected": -4.179599761962891, - "eval_runtime": 278.9579, - "eval_samples_per_second": 7.17, - "eval_steps_per_second": 0.448, + "eval_logits/chosen": -2.7743890285491943, + "eval_logits/rejected": -2.8083643913269043, + "eval_logps/chosen": -243.59608459472656, + "eval_logps/rejected": -286.77154541015625, + "eval_loss": 0.31729114055633545, + "eval_rewards/accuracies": 0.8500000238418579, + "eval_rewards/chosen": 0.42184263467788696, + "eval_rewards/margins": 6.525611400604248, + "eval_rewards/rejected": -6.103768348693848, + "eval_runtime": 276.4143, + "eval_samples_per_second": 7.236, + "eval_steps_per_second": 0.452, "step": 2300 }, { "epoch": 1.19, "learning_rate": 3.3476764199655765e-07, - "logits/chosen": -2.802063465118408, - "logits/rejected": -2.82922625541687, - "logps/chosen": -216.39633178710938, - "logps/rejected": -306.6519470214844, - "loss": 0.0655, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.3033801317214966, - "rewards/margins": 7.051173210144043, - "rewards/rejected": -5.747794151306152, + "logits/chosen": -2.753267288208008, + "logits/rejected": -2.759813070297241, + "logps/chosen": -215.71096801757812, + "logps/rejected": -258.9280090332031, + "loss": 0.1212, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.3726128339767456, + "rewards/margins": 8.379122734069824, + "rewards/rejected": -7.006511688232422, "step": 2310 }, { "epoch": 1.2, "learning_rate": 3.3381143622107477e-07, - "logits/chosen": -2.7702291011810303, - "logits/rejected": -2.840040683746338, - "logps/chosen": -283.4988708496094, - "logps/rejected": -366.2091369628906, - "loss": 0.046, + "logits/chosen": -2.700157880783081, + "logits/rejected": -2.757406234741211, + "logps/chosen": -276.79833984375, + "logps/rejected": -303.6136779785156, + "loss": 0.0463, "rewards/accuracies": 1.0, - "rewards/chosen": 1.2266050577163696, - "rewards/margins": 8.91787338256836, - "rewards/rejected": -7.691267967224121, + "rewards/chosen": 1.8958237171173096, + "rewards/margins": 10.118815422058105, + "rewards/rejected": -8.222991943359375, "step": 2320 }, { "epoch": 1.2, "learning_rate": 3.328552304455919e-07, - "logits/chosen": -2.8088457584381104, - "logits/rejected": -2.8401389122009277, - "logps/chosen": -233.0177001953125, - "logps/rejected": -323.83734130859375, - "loss": 0.076, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.7884088754653931, - "rewards/margins": 6.651333808898926, - "rewards/rejected": -5.862925052642822, + "logits/chosen": -2.7440972328186035, + "logits/rejected": -2.7411763668060303, + "logps/chosen": -230.77664184570312, + "logps/rejected": -297.2944641113281, + "loss": 0.0303, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.0124015808105469, + "rewards/margins": 8.008207321166992, + "rewards/rejected": -6.9958062171936035, "step": 2330 }, { "epoch": 1.21, "learning_rate": 3.31899024670109e-07, - "logits/chosen": -2.820403575897217, - "logits/rejected": -2.8067612648010254, - "logps/chosen": -228.03482055664062, - "logps/rejected": -342.9540710449219, - "loss": 0.0836, + "logits/chosen": -2.7659590244293213, + "logits/rejected": -2.7217228412628174, + "logps/chosen": -232.265380859375, + "logps/rejected": -315.9333190917969, + "loss": 0.0436, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.4116134643554688, - "rewards/margins": 7.760331630706787, - "rewards/rejected": -6.34871768951416, + "rewards/chosen": 0.9885537028312683, + "rewards/margins": 9.558282852172852, + "rewards/rejected": -8.56972885131836, "step": 2340 }, { "epoch": 1.21, "learning_rate": 3.309428188946261e-07, - "logits/chosen": -2.7867391109466553, - "logits/rejected": -2.8338825702667236, - "logps/chosen": -221.8268280029297, - "logps/rejected": -323.56317138671875, - "loss": 0.0767, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.8608427047729492, - "rewards/margins": 7.180264472961426, - "rewards/rejected": -6.319421768188477, + "logits/chosen": -2.7043755054473877, + "logits/rejected": -2.7354531288146973, + "logps/chosen": -213.41006469726562, + "logps/rejected": -314.4804992675781, + "loss": 0.0811, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.702521562576294, + "rewards/margins": 9.989812850952148, + "rewards/rejected": -8.287291526794434, "step": 2350 }, { "epoch": 1.22, "learning_rate": 3.2998661311914323e-07, - "logits/chosen": -2.7895936965942383, - "logits/rejected": -2.8425674438476562, - "logps/chosen": -241.6412811279297, - "logps/rejected": -352.6873779296875, - "loss": 0.0884, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.3408452272415161, - "rewards/margins": 8.829115867614746, - "rewards/rejected": -7.488271236419678, + "logits/chosen": -2.6938185691833496, + "logits/rejected": -2.7287726402282715, + "logps/chosen": -239.980712890625, + "logps/rejected": -293.25164794921875, + "loss": 0.0883, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5076091289520264, + "rewards/margins": 9.779886245727539, + "rewards/rejected": -8.272275924682617, "step": 2360 }, { "epoch": 1.22, "learning_rate": 3.2903040734366035e-07, - "logits/chosen": -2.835283041000366, - "logits/rejected": -2.8766496181488037, - "logps/chosen": -222.8701171875, - "logps/rejected": -350.6272888183594, - "loss": 0.0624, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.3985366821289062, - "rewards/margins": 8.540349006652832, - "rewards/rejected": -7.141812324523926, + "logits/chosen": -2.753580093383789, + "logits/rejected": -2.794508218765259, + "logps/chosen": -217.73898315429688, + "logps/rejected": -306.1284484863281, + "loss": 0.0661, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.9116510152816772, + "rewards/margins": 10.028429985046387, + "rewards/rejected": -8.116777420043945, "step": 2370 }, { "epoch": 1.23, "learning_rate": 3.2807420156817746e-07, - "logits/chosen": -2.733876943588257, - "logits/rejected": -2.7979674339294434, - "logps/chosen": -194.2627410888672, - "logps/rejected": -343.9058532714844, - "loss": 0.0898, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.2534451484680176, - "rewards/margins": 7.484607696533203, - "rewards/rejected": -6.2311625480651855, + "logits/chosen": -2.6265764236450195, + "logits/rejected": -2.6648049354553223, + "logps/chosen": -197.09335327148438, + "logps/rejected": -321.7854309082031, + "loss": 0.0704, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.9703874588012695, + "rewards/margins": 8.461416244506836, + "rewards/rejected": -7.491029262542725, "step": 2380 }, { "epoch": 1.23, "learning_rate": 3.271179957926946e-07, - "logits/chosen": -2.8532767295837402, - "logits/rejected": -2.884650707244873, - "logps/chosen": -263.845458984375, - "logps/rejected": -340.37127685546875, - "loss": 0.0768, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.687608540058136, - "rewards/margins": 7.081732273101807, - "rewards/rejected": -6.394123077392578, + "logits/chosen": -2.702601671218872, + "logits/rejected": -2.7486157417297363, + "logps/chosen": -261.2791442871094, + "logps/rejected": -309.7445373535156, + "loss": 0.0472, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.9444568753242493, + "rewards/margins": 8.669207572937012, + "rewards/rejected": -7.7247514724731445, "step": 2390 }, { "epoch": 1.24, "learning_rate": 3.261617900172117e-07, - "logits/chosen": -2.8089540004730225, - "logits/rejected": -2.8392434120178223, - "logps/chosen": -250.58303833007812, - "logps/rejected": -347.5687561035156, - "loss": 0.1026, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.9663568735122681, - "rewards/margins": 6.710597991943359, - "rewards/rejected": -5.744241714477539, + "logits/chosen": -2.71286940574646, + "logits/rejected": -2.7070486545562744, + "logps/chosen": -249.5623016357422, + "logps/rejected": -305.1341857910156, + "loss": 0.1132, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.0684294700622559, + "rewards/margins": 8.95272445678711, + "rewards/rejected": -7.8842949867248535, "step": 2400 }, { "epoch": 1.24, - "eval_logits/chosen": -2.8102970123291016, - "eval_logits/rejected": -2.8701605796813965, - "eval_logps/chosen": -246.5922393798828, - "eval_logps/rejected": -315.093994140625, - "eval_loss": 0.48624616861343384, - "eval_rewards/accuracies": 0.8159999847412109, - "eval_rewards/chosen": 0.1087898463010788, - "eval_rewards/margins": 4.004448413848877, - "eval_rewards/rejected": -3.8956587314605713, - "eval_runtime": 278.4354, - "eval_samples_per_second": 7.183, - "eval_steps_per_second": 0.449, + "eval_logits/chosen": -2.7427713871002197, + "eval_logits/rejected": -2.7845394611358643, + "eval_logps/chosen": -242.65687561035156, + "eval_logps/rejected": -289.41644287109375, + "eval_loss": 0.36191657185554504, + "eval_rewards/accuracies": 0.8619999885559082, + "eval_rewards/chosen": 0.5157621502876282, + "eval_rewards/margins": 6.8840227127075195, + "eval_rewards/rejected": -6.368260860443115, + "eval_runtime": 276.3627, + "eval_samples_per_second": 7.237, + "eval_steps_per_second": 0.452, "step": 2400 }, { "epoch": 1.24, "learning_rate": 3.2520558424172876e-07, - "logits/chosen": -2.8509275913238525, - "logits/rejected": -2.8675315380096436, - "logps/chosen": -216.6479034423828, - "logps/rejected": -314.75885009765625, - "loss": 0.0799, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.7996807098388672, - "rewards/margins": 7.730632781982422, - "rewards/rejected": -5.930952548980713, + "logits/chosen": -2.7597126960754395, + "logits/rejected": -2.790642023086548, + "logps/chosen": -216.7511749267578, + "logps/rejected": -280.5323181152344, + "loss": 0.1096, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.7892688512802124, + "rewards/margins": 9.918553352355957, + "rewards/rejected": -8.129284858703613, "step": 2410 }, { "epoch": 1.25, "learning_rate": 3.242493784662459e-07, - "logits/chosen": -2.8042349815368652, - "logits/rejected": -2.8402154445648193, - "logps/chosen": -223.0919647216797, - "logps/rejected": -324.47357177734375, - "loss": 0.0694, + "logits/chosen": -2.73762845993042, + "logits/rejected": -2.7712318897247314, + "logps/chosen": -224.7061767578125, + "logps/rejected": -295.7691650390625, + "loss": 0.0421, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.4728593826293945, - "rewards/margins": 8.073456764221191, - "rewards/rejected": -6.600597381591797, + "rewards/chosen": 1.3114393949508667, + "rewards/margins": 10.272552490234375, + "rewards/rejected": -8.961112976074219, "step": 2420 }, { "epoch": 1.25, "learning_rate": 3.2329317269076304e-07, - "logits/chosen": -2.7997336387634277, - "logits/rejected": -2.8355021476745605, - "logps/chosen": -275.755615234375, - "logps/rejected": -410.38885498046875, - "loss": 0.0997, + "logits/chosen": -2.7739930152893066, + "logits/rejected": -2.7679646015167236, + "logps/chosen": -265.0934143066406, + "logps/rejected": -363.01947021484375, + "loss": 0.0707, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.0294463634490967, - "rewards/margins": 8.784956932067871, - "rewards/rejected": -7.755509853363037, + "rewards/chosen": 2.0977931022644043, + "rewards/margins": 11.470593452453613, + "rewards/rejected": -9.372800827026367, "step": 2430 }, { "epoch": 1.26, "learning_rate": 3.2233696691528016e-07, - "logits/chosen": -2.7436885833740234, - "logits/rejected": -2.8203845024108887, - "logps/chosen": -198.19422912597656, - "logps/rejected": -370.61602783203125, - "loss": 0.1348, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.599268913269043, - "rewards/margins": 7.958105564117432, - "rewards/rejected": -6.3588361740112305, + "logits/chosen": -2.677910327911377, + "logits/rejected": -2.772629976272583, + "logps/chosen": -201.72589111328125, + "logps/rejected": -352.03997802734375, + "loss": 0.1574, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.7549591064453125, + "rewards/margins": 10.211446762084961, + "rewards/rejected": -8.456486701965332, "step": 2440 }, { "epoch": 1.26, "learning_rate": 3.2138076113979727e-07, - "logits/chosen": -2.7402098178863525, - "logits/rejected": -2.773638963699341, - "logps/chosen": -274.2806396484375, - "logps/rejected": -379.9324035644531, - "loss": 0.0887, + "logits/chosen": -2.676964282989502, + "logits/rejected": -2.7039332389831543, + "logps/chosen": -265.99261474609375, + "logps/rejected": -364.07293701171875, + "loss": 0.0581, "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.12480942904949188, - "rewards/margins": 7.0526933670043945, - "rewards/rejected": -6.927884101867676, + "rewards/chosen": 0.9536114931106567, + "rewards/margins": 9.440874099731445, + "rewards/rejected": -8.487262725830078, "step": 2450 }, { "epoch": 1.27, "learning_rate": 3.204245553643144e-07, - "logits/chosen": -2.7169342041015625, - "logits/rejected": -2.7519021034240723, - "logps/chosen": -303.33770751953125, - "logps/rejected": -361.6357421875, - "loss": 0.0583, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.9514210820198059, - "rewards/margins": 7.6320390701293945, - "rewards/rejected": -6.680616855621338, + "logits/chosen": -2.6750779151916504, + "logits/rejected": -2.6993095874786377, + "logps/chosen": -301.03515625, + "logps/rejected": -337.91558837890625, + "loss": 0.0877, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.1825854778289795, + "rewards/margins": 10.256135940551758, + "rewards/rejected": -9.073549270629883, "step": 2460 }, { "epoch": 1.28, "learning_rate": 3.194683495888315e-07, - "logits/chosen": -2.7771124839782715, - "logits/rejected": -2.83833646774292, - "logps/chosen": -237.21743774414062, - "logps/rejected": -352.67486572265625, - "loss": 0.1615, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.3097350597381592, - "rewards/margins": 7.885691165924072, - "rewards/rejected": -6.575955867767334, + "logits/chosen": -2.756789207458496, + "logits/rejected": -2.8234970569610596, + "logps/chosen": -234.93212890625, + "logps/rejected": -293.1192932128906, + "loss": 0.0356, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5365440845489502, + "rewards/margins": 10.162076950073242, + "rewards/rejected": -8.625533103942871, "step": 2470 }, { "epoch": 1.28, "learning_rate": 3.185121438133486e-07, - "logits/chosen": -2.7796790599823, - "logits/rejected": -2.78997540473938, - "logps/chosen": -293.5530700683594, - "logps/rejected": -425.14208984375, - "loss": 0.0732, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.4847608804702759, - "rewards/margins": 8.36500358581543, - "rewards/rejected": -6.88024377822876, + "logits/chosen": -2.754288911819458, + "logits/rejected": -2.7445130348205566, + "logps/chosen": -292.94659423828125, + "logps/rejected": -370.8139953613281, + "loss": 0.0525, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.5465762615203857, + "rewards/margins": 11.008524894714355, + "rewards/rejected": -9.46194839477539, "step": 2480 }, { "epoch": 1.29, "learning_rate": 3.1755593803786574e-07, - "logits/chosen": -2.8316915035247803, - "logits/rejected": -2.8410391807556152, - "logps/chosen": -223.18466186523438, - "logps/rejected": -332.941162109375, - "loss": 0.0682, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 1.1314014196395874, - "rewards/margins": 8.023019790649414, - "rewards/rejected": -6.891618251800537, + "logits/chosen": -2.7772982120513916, + "logits/rejected": -2.7778236865997314, + "logps/chosen": -230.0045928955078, + "logps/rejected": -288.295166015625, + "loss": 0.0476, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.9591878056526184, + "rewards/margins": 10.264433860778809, + "rewards/rejected": -9.305246353149414, "step": 2490 }, { "epoch": 1.29, "learning_rate": 3.1659973226238285e-07, - "logits/chosen": -2.8022923469543457, - "logits/rejected": -2.8136115074157715, - "logps/chosen": -260.37603759765625, - "logps/rejected": -328.55810546875, - "loss": 0.104, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.0766308307647705, - "rewards/margins": 7.775477409362793, - "rewards/rejected": -6.69884729385376, + "logits/chosen": -2.773174285888672, + "logits/rejected": -2.773740768432617, + "logps/chosen": -264.3467712402344, + "logps/rejected": -290.15985107421875, + "loss": 0.0455, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.6024417877197266, + "rewards/margins": 10.383742332458496, + "rewards/rejected": -8.781301498413086, "step": 2500 }, { "epoch": 1.29, - "eval_logits/chosen": -2.7535407543182373, - "eval_logits/rejected": -2.810478448867798, - "eval_logps/chosen": -253.7227783203125, - "eval_logps/rejected": -326.864013671875, - "eval_loss": 0.5141146779060364, - "eval_rewards/accuracies": 0.8080000281333923, - "eval_rewards/chosen": -0.60426265001297, - "eval_rewards/margins": 4.468394756317139, - "eval_rewards/rejected": -5.072656631469727, - "eval_runtime": 279.4079, - "eval_samples_per_second": 7.158, - "eval_steps_per_second": 0.447, + "eval_logits/chosen": -2.8059160709381104, + "eval_logits/rejected": -2.8465514183044434, + "eval_logps/chosen": -244.03248596191406, + "eval_logps/rejected": -295.3028869628906, + "eval_loss": 0.3457484245300293, + "eval_rewards/accuracies": 0.8679999709129333, + "eval_rewards/chosen": 0.3782012164592743, + "eval_rewards/margins": 7.335104465484619, + "eval_rewards/rejected": -6.956903457641602, + "eval_runtime": 276.162, + "eval_samples_per_second": 7.242, + "eval_steps_per_second": 0.453, "step": 2500 }, { "epoch": 1.3, "learning_rate": 3.1564352648689997e-07, - "logits/chosen": -2.826930522918701, - "logits/rejected": -2.872891902923584, - "logps/chosen": -247.903564453125, - "logps/rejected": -338.798095703125, - "loss": 0.1311, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 0.7270460724830627, - "rewards/margins": 7.407177925109863, - "rewards/rejected": -6.680130958557129, + "logits/chosen": -2.8566792011260986, + "logits/rejected": -2.8793201446533203, + "logps/chosen": -241.57119750976562, + "logps/rejected": -295.3194885253906, + "loss": 0.0701, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.3355607986450195, + "rewards/margins": 10.345914840698242, + "rewards/rejected": -9.010354995727539, "step": 2510 }, { "epoch": 1.3, "learning_rate": 3.146873207114171e-07, - "logits/chosen": -2.749908447265625, - "logits/rejected": -2.7867674827575684, - "logps/chosen": -194.88047790527344, - "logps/rejected": -334.09234619140625, - "loss": 0.0771, + "logits/chosen": -2.807373046875, + "logits/rejected": -2.8323373794555664, + "logps/chosen": -194.2996826171875, + "logps/rejected": -293.05303955078125, + "loss": 0.0701, "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.3872716426849365, - "rewards/margins": 8.179627418518066, - "rewards/rejected": -6.792355537414551, + "rewards/chosen": 1.4453535079956055, + "rewards/margins": 9.924449920654297, + "rewards/rejected": -8.479095458984375, "step": 2520 }, { "epoch": 1.31, "learning_rate": 3.137311149359342e-07, - "logits/chosen": -2.790775775909424, - "logits/rejected": -2.8377020359039307, - "logps/chosen": -282.1646423339844, - "logps/rejected": -381.095703125, - "loss": 0.091, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.0731854438781738, - "rewards/margins": 6.98352575302124, - "rewards/rejected": -5.910340309143066, + "logits/chosen": -2.833449125289917, + "logits/rejected": -2.847416400909424, + "logps/chosen": -280.81231689453125, + "logps/rejected": -344.94842529296875, + "loss": 0.0852, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.323650598526001, + "rewards/margins": 8.957289695739746, + "rewards/rejected": -7.633638858795166, "step": 2530 }, { "epoch": 1.31, "learning_rate": 3.127749091604513e-07, - "logits/chosen": -2.8378074169158936, - "logits/rejected": -2.8901278972625732, - "logps/chosen": -307.4638977050781, - "logps/rejected": -385.1072998046875, - "loss": 0.074, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.6109613180160522, - "rewards/margins": 8.63659381866455, - "rewards/rejected": -7.025631904602051, + "logits/chosen": -2.792257308959961, + "logits/rejected": -2.7943286895751953, + "logps/chosen": -308.4271545410156, + "logps/rejected": -309.96929931640625, + "loss": 0.0595, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.5143873691558838, + "rewards/margins": 10.729879379272461, + "rewards/rejected": -9.215494155883789, "step": 2540 }, { "epoch": 1.32, "learning_rate": 3.1181870338496843e-07, - "logits/chosen": -2.825392007827759, - "logits/rejected": -2.8577558994293213, - "logps/chosen": -219.1394805908203, - "logps/rejected": -337.9822692871094, - "loss": 0.1222, + "logits/chosen": -2.7642436027526855, + "logits/rejected": -2.7878854274749756, + "logps/chosen": -214.78857421875, + "logps/rejected": -299.8769836425781, + "loss": 0.0486, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.7719315886497498, - "rewards/margins": 6.5245184898376465, - "rewards/rejected": -5.752586364746094, + "rewards/chosen": 1.2070233821868896, + "rewards/margins": 8.471216201782227, + "rewards/rejected": -7.2641921043396, "step": 2550 }, { "epoch": 1.32, "learning_rate": 3.108624976094856e-07, - "logits/chosen": -2.869966506958008, - "logits/rejected": -2.9369266033172607, - "logps/chosen": -232.96163940429688, - "logps/rejected": -306.9654235839844, - "loss": 0.1048, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.9050451517105103, - "rewards/margins": 7.699380397796631, - "rewards/rejected": -5.794335842132568, + "logits/chosen": -2.7631824016571045, + "logits/rejected": -2.8334383964538574, + "logps/chosen": -236.7304229736328, + "logps/rejected": -275.130615234375, + "loss": 0.1542, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.5281670093536377, + "rewards/margins": 9.417257308959961, + "rewards/rejected": -7.889090061187744, "step": 2560 }, { "epoch": 1.33, "learning_rate": 3.0990629183400266e-07, - "logits/chosen": -2.839395046234131, - "logits/rejected": -2.8635356426239014, - "logps/chosen": -239.1020050048828, - "logps/rejected": -366.38623046875, - "loss": 0.1204, + "logits/chosen": -2.8007915019989014, + "logits/rejected": -2.8007919788360596, + "logps/chosen": -240.93692016601562, + "logps/rejected": -316.1221008300781, + "loss": 0.0755, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.9102404117584229, - "rewards/margins": 9.183894157409668, - "rewards/rejected": -7.273654937744141, + "rewards/chosen": 1.9544613361358643, + "rewards/margins": 10.83757209777832, + "rewards/rejected": -8.883111953735352, "step": 2570 }, { "epoch": 1.33, "learning_rate": 3.089500860585198e-07, - "logits/chosen": -2.829850912094116, - "logits/rejected": -2.887627363204956, - "logps/chosen": -200.79571533203125, - "logps/rejected": -296.86932373046875, - "loss": 0.1213, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.3977177143096924, - "rewards/margins": 6.472372531890869, - "rewards/rejected": -6.074654579162598, + "logits/chosen": -2.7547965049743652, + "logits/rejected": -2.8169667720794678, + "logps/chosen": -193.8870086669922, + "logps/rejected": -294.17974853515625, + "loss": 0.0539, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.0885858535766602, + "rewards/margins": 10.255282402038574, + "rewards/rejected": -9.16669750213623, "step": 2580 }, { "epoch": 1.34, "learning_rate": 3.079938802830369e-07, - "logits/chosen": -2.911804676055908, - "logits/rejected": -2.9058237075805664, - "logps/chosen": -226.080810546875, - "logps/rejected": -261.0593566894531, - "loss": 0.0687, + "logits/chosen": -2.856707811355591, + "logits/rejected": -2.8490512371063232, + "logps/chosen": -225.03219604492188, + "logps/rejected": -241.29031372070312, + "loss": 0.0395, "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.5239999294281006, - "rewards/margins": 7.221141815185547, - "rewards/rejected": -5.697141647338867, + "rewards/chosen": 1.6288611888885498, + "rewards/margins": 9.060091018676758, + "rewards/rejected": -7.431227684020996, "step": 2590 }, { "epoch": 1.34, "learning_rate": 3.07037674507554e-07, - "logits/chosen": -2.8087925910949707, - "logits/rejected": -2.8416666984558105, - "logps/chosen": -226.8069610595703, - "logps/rejected": -407.0137939453125, - "loss": 0.0728, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.4006208181381226, - "rewards/margins": 7.8127875328063965, - "rewards/rejected": -6.412166595458984, + "logits/chosen": -2.745065212249756, + "logits/rejected": -2.7855966091156006, + "logps/chosen": -220.8400115966797, + "logps/rejected": -331.33599853515625, + "loss": 0.0506, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.051476001739502, + "rewards/margins": 10.355587005615234, + "rewards/rejected": -8.30411148071289, "step": 2600 }, { "epoch": 1.34, - "eval_logits/chosen": -2.8015549182891846, - "eval_logits/rejected": -2.86586594581604, - "eval_logps/chosen": -253.48963928222656, - "eval_logps/rejected": -326.0744323730469, - "eval_loss": 0.5166017413139343, - "eval_rewards/accuracies": 0.8080000281333923, - "eval_rewards/chosen": -0.5809494256973267, - "eval_rewards/margins": 4.412755012512207, - "eval_rewards/rejected": -4.993704319000244, - "eval_runtime": 278.9732, - "eval_samples_per_second": 7.169, - "eval_steps_per_second": 0.448, + "eval_logits/chosen": -2.8132107257843018, + "eval_logits/rejected": -2.855024814605713, + "eval_logps/chosen": -242.45858764648438, + "eval_logps/rejected": -292.61236572265625, + "eval_loss": 0.3638141453266144, + "eval_rewards/accuracies": 0.8740000128746033, + "eval_rewards/chosen": 0.5355889201164246, + "eval_rewards/margins": 7.223442077636719, + "eval_rewards/rejected": -6.6878533363342285, + "eval_runtime": 276.1989, + "eval_samples_per_second": 7.241, + "eval_steps_per_second": 0.453, "step": 2600 }, { "epoch": 1.35, "learning_rate": 3.060814687320711e-07, - "logits/chosen": -2.804701089859009, - "logits/rejected": -2.8087198734283447, - "logps/chosen": -262.0416564941406, - "logps/rejected": -343.32769775390625, - "loss": 0.0779, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.0394731760025024, - "rewards/margins": 7.696552276611328, - "rewards/rejected": -6.657078742980957, + "logits/chosen": -2.7801144123077393, + "logits/rejected": -2.7822697162628174, + "logps/chosen": -254.675537109375, + "logps/rejected": -312.4071044921875, + "loss": 0.045, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7761180400848389, + "rewards/margins": 10.633474349975586, + "rewards/rejected": -8.857357025146484, "step": 2610 }, { "epoch": 1.35, "learning_rate": 3.0512526295658824e-07, - "logits/chosen": -2.7707972526550293, - "logits/rejected": -2.8042078018188477, - "logps/chosen": -240.3668975830078, - "logps/rejected": -345.0741271972656, - "loss": 0.0837, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.2935864925384521, - "rewards/margins": 8.221081733703613, - "rewards/rejected": -6.92749547958374, + "logits/chosen": -2.7555644512176514, + "logits/rejected": -2.7959461212158203, + "logps/chosen": -232.75912475585938, + "logps/rejected": -317.05584716796875, + "loss": 0.0635, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2575905323028564, + "rewards/margins": 10.738282203674316, + "rewards/rejected": -8.480690956115723, "step": 2620 }, { "epoch": 1.36, "learning_rate": 3.0416905718110536e-07, - "logits/chosen": -2.9000189304351807, - "logits/rejected": -2.9776604175567627, - "logps/chosen": -212.4930877685547, - "logps/rejected": -303.805908203125, - "loss": 0.1076, + "logits/chosen": -2.786776065826416, + "logits/rejected": -2.8622710704803467, + "logps/chosen": -206.95986938476562, + "logps/rejected": -267.4670104980469, + "loss": 0.1234, "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.9751327633857727, - "rewards/margins": 7.801082611083984, - "rewards/rejected": -6.825949192047119, + "rewards/chosen": 1.5284591913223267, + "rewards/margins": 8.781716346740723, + "rewards/rejected": -7.253256320953369, "step": 2630 }, { "epoch": 1.36, "learning_rate": 3.0321285140562247e-07, - "logits/chosen": -2.8881120681762695, - "logits/rejected": -2.9165701866149902, - "logps/chosen": -246.6224365234375, - "logps/rejected": -306.96014404296875, - "loss": 0.092, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.041773706674575806, - "rewards/margins": 6.964110374450684, - "rewards/rejected": -6.922336578369141, + "logits/chosen": -2.7850725650787354, + "logits/rejected": -2.7539660930633545, + "logps/chosen": -236.6200408935547, + "logps/rejected": -258.26885986328125, + "loss": 0.0492, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.0427087545394897, + "rewards/margins": 9.852095603942871, + "rewards/rejected": -8.809386253356934, "step": 2640 }, { "epoch": 1.37, "learning_rate": 3.022566456301396e-07, - "logits/chosen": -2.865769147872925, - "logits/rejected": -2.8971917629241943, - "logps/chosen": -223.96884155273438, - "logps/rejected": -329.19268798828125, - "loss": 0.1457, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.885826587677002, - "rewards/margins": 7.315209865570068, - "rewards/rejected": -6.429383754730225, + "logits/chosen": -2.7758936882019043, + "logits/rejected": -2.7873213291168213, + "logps/chosen": -216.4073944091797, + "logps/rejected": -297.9481506347656, + "loss": 0.1137, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.6412105560302734, + "rewards/margins": 10.181042671203613, + "rewards/rejected": -8.539831161499023, "step": 2650 }, { "epoch": 1.37, "learning_rate": 3.013004398546567e-07, - "logits/chosen": -2.846896171569824, - "logits/rejected": -2.8020544052124023, - "logps/chosen": -225.41665649414062, - "logps/rejected": -355.18658447265625, - "loss": 0.104, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.7547341585159302, - "rewards/margins": 7.292550563812256, - "rewards/rejected": -6.537816047668457, + "logits/chosen": -2.7363452911376953, + "logits/rejected": -2.7088732719421387, + "logps/chosen": -219.45266723632812, + "logps/rejected": -353.4142150878906, + "loss": 0.0587, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.3663221597671509, + "rewards/margins": 8.88754653930664, + "rewards/rejected": -7.521223545074463, "step": 2660 }, { "epoch": 1.38, "learning_rate": 3.003442340791738e-07, - "logits/chosen": -2.9464218616485596, - "logits/rejected": -2.9827699661254883, - "logps/chosen": -226.2906494140625, - "logps/rejected": -321.0039978027344, - "loss": 0.0758, + "logits/chosen": -2.8551442623138428, + "logits/rejected": -2.8802449703216553, + "logps/chosen": -232.0399932861328, + "logps/rejected": -279.8020324707031, + "loss": 0.0526, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.8007649183273315, - "rewards/margins": 7.7752275466918945, - "rewards/rejected": -6.974462032318115, + "rewards/chosen": 0.835440456867218, + "rewards/margins": 9.41199016571045, + "rewards/rejected": -8.576549530029297, "step": 2670 }, { "epoch": 1.38, "learning_rate": 2.9938802830369093e-07, - "logits/chosen": -2.916825771331787, - "logits/rejected": -2.9472413063049316, - "logps/chosen": -264.4847412109375, - "logps/rejected": -352.3399353027344, - "loss": 0.1488, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.8589404225349426, - "rewards/margins": 8.047723770141602, - "rewards/rejected": -7.188782691955566, + "logits/chosen": -2.826984167098999, + "logits/rejected": -2.846038341522217, + "logps/chosen": -261.65374755859375, + "logps/rejected": -290.5477600097656, + "loss": 0.0874, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.1431066989898682, + "rewards/margins": 10.978265762329102, + "rewards/rejected": -9.835159301757812, "step": 2680 }, { "epoch": 1.39, "learning_rate": 2.9843182252820805e-07, - "logits/chosen": -2.893075704574585, - "logits/rejected": -2.9288330078125, - "logps/chosen": -200.18128967285156, - "logps/rejected": -332.02392578125, - "loss": 0.097, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.269403100013733, - "rewards/margins": 8.271394729614258, - "rewards/rejected": -7.001992702484131, + "logits/chosen": -2.7579092979431152, + "logits/rejected": -2.7959165573120117, + "logps/chosen": -197.5676727294922, + "logps/rejected": -315.44342041015625, + "loss": 0.0596, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.5481231212615967, + "rewards/margins": 11.245332717895508, + "rewards/rejected": -9.697210311889648, "step": 2690 }, { "epoch": 1.39, "learning_rate": 2.974756167527252e-07, - "logits/chosen": -2.94435715675354, - "logits/rejected": -2.9935834407806396, - "logps/chosen": -218.6113739013672, - "logps/rejected": -337.3885803222656, - "loss": 0.0844, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.4172857403755188, - "rewards/margins": 6.798937797546387, - "rewards/rejected": -6.38165283203125, + "logits/chosen": -2.8283803462982178, + "logits/rejected": -2.860241413116455, + "logps/chosen": -217.2318572998047, + "logps/rejected": -292.4986877441406, + "loss": 0.0561, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.557071328163147, + "rewards/margins": 9.96302604675293, + "rewards/rejected": -9.405956268310547, "step": 2700 }, { "epoch": 1.39, - "eval_logits/chosen": -2.8305044174194336, - "eval_logits/rejected": -2.8900794982910156, - "eval_logps/chosen": -253.89149475097656, - "eval_logps/rejected": -322.5744323730469, - "eval_loss": 0.4835141599178314, - "eval_rewards/accuracies": 0.8159999847412109, - "eval_rewards/chosen": -0.6211313009262085, - "eval_rewards/margins": 4.022569179534912, - "eval_rewards/rejected": -4.64370059967041, - "eval_runtime": 278.7686, - "eval_samples_per_second": 7.174, - "eval_steps_per_second": 0.448, + "eval_logits/chosen": -2.76045823097229, + "eval_logits/rejected": -2.7937419414520264, + "eval_logps/chosen": -247.61495971679688, + "eval_logps/rejected": -294.6885070800781, + "eval_loss": 0.34291791915893555, + "eval_rewards/accuracies": 0.8539999723434448, + "eval_rewards/chosen": 0.01995224691927433, + "eval_rewards/margins": 6.915416240692139, + "eval_rewards/rejected": -6.895462989807129, + "eval_runtime": 276.2702, + "eval_samples_per_second": 7.239, + "eval_steps_per_second": 0.452, "step": 2700 }, { "epoch": 1.4, "learning_rate": 2.9651941097724233e-07, - "logits/chosen": -2.9109933376312256, - "logits/rejected": -2.9616150856018066, - "logps/chosen": -247.9290008544922, - "logps/rejected": -294.053466796875, - "loss": 0.1869, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.8045139312744141, - "rewards/margins": 6.199316501617432, - "rewards/rejected": -5.394802093505859, + "logits/chosen": -2.7905402183532715, + "logits/rejected": -2.8540217876434326, + "logps/chosen": -245.46163940429688, + "logps/rejected": -287.2901611328125, + "loss": 0.148, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.0664794445037842, + "rewards/margins": 8.838623046875, + "rewards/rejected": -7.7721428871154785, "step": 2710 }, { "epoch": 1.4, "learning_rate": 2.9556320520175945e-07, - "logits/chosen": -2.915855884552002, - "logits/rejected": -2.9131040573120117, - "logps/chosen": -244.89242553710938, - "logps/rejected": -310.23931884765625, - "loss": 0.1395, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.6600807905197144, - "rewards/margins": 7.267942905426025, - "rewards/rejected": -6.6078619956970215, + "logits/chosen": -2.7694029808044434, + "logits/rejected": -2.7353501319885254, + "logps/chosen": -245.0088348388672, + "logps/rejected": -288.64227294921875, + "loss": 0.0934, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.6484416127204895, + "rewards/margins": 10.08415412902832, + "rewards/rejected": -9.435712814331055, "step": 2720 }, { "epoch": 1.41, "learning_rate": 2.946069994262765e-07, - "logits/chosen": -2.9800467491149902, - "logits/rejected": -2.9676592350006104, - "logps/chosen": -240.47412109375, - "logps/rejected": -379.64306640625, - "loss": 0.1346, + "logits/chosen": -2.849344253540039, + "logits/rejected": -2.8188083171844482, + "logps/chosen": -235.24935913085938, + "logps/rejected": -330.847900390625, + "loss": 0.0738, "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.6723322868347168, - "rewards/margins": 7.420382499694824, - "rewards/rejected": -6.748049736022949, + "rewards/chosen": 1.19480562210083, + "rewards/margins": 9.427337646484375, + "rewards/rejected": -8.232531547546387, "step": 2730 }, { "epoch": 1.41, "learning_rate": 2.9365079365079363e-07, - "logits/chosen": -2.9388279914855957, - "logits/rejected": -2.9169228076934814, - "logps/chosen": -240.00277709960938, - "logps/rejected": -291.1402282714844, - "loss": 0.0738, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.0642039775848389, - "rewards/margins": 7.567967891693115, - "rewards/rejected": -6.5037641525268555, + "logits/chosen": -2.807582378387451, + "logits/rejected": -2.815614938735962, + "logps/chosen": -232.57577514648438, + "logps/rejected": -265.2566833496094, + "loss": 0.0804, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.806540846824646, + "rewards/margins": 9.591080665588379, + "rewards/rejected": -7.784539222717285, "step": 2740 }, { "epoch": 1.42, "learning_rate": 2.9269458787531074e-07, - "logits/chosen": -2.854447603225708, - "logits/rejected": -2.8605027198791504, - "logps/chosen": -214.74441528320312, - "logps/rejected": -344.01983642578125, - "loss": 0.1133, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.8734567761421204, - "rewards/margins": 6.573145866394043, - "rewards/rejected": -5.699689865112305, + "logits/chosen": -2.752331256866455, + "logits/rejected": -2.733701229095459, + "logps/chosen": -213.12020874023438, + "logps/rejected": -332.69036865234375, + "loss": 0.0757, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.0358021259307861, + "rewards/margins": 9.005339622497559, + "rewards/rejected": -7.969538688659668, "step": 2750 }, { "epoch": 1.42, "learning_rate": 2.9173838209982786e-07, - "logits/chosen": -2.8964381217956543, - "logits/rejected": -2.9152231216430664, - "logps/chosen": -248.6080780029297, - "logps/rejected": -294.92474365234375, - "loss": 0.0746, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.7927258610725403, - "rewards/margins": 7.022311210632324, - "rewards/rejected": -6.229586601257324, + "logits/chosen": -2.8155770301818848, + "logits/rejected": -2.7966866493225098, + "logps/chosen": -246.49020385742188, + "logps/rejected": -239.0867462158203, + "loss": 0.0354, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.0047332048416138, + "rewards/margins": 11.007787704467773, + "rewards/rejected": -10.003053665161133, "step": 2760 }, { "epoch": 1.43, "learning_rate": 2.90782176324345e-07, - "logits/chosen": -2.9137072563171387, - "logits/rejected": -2.9111227989196777, - "logps/chosen": -331.26812744140625, - "logps/rejected": -336.87799072265625, - "loss": 0.1427, + "logits/chosen": -2.7919669151306152, + "logits/rejected": -2.8168044090270996, + "logps/chosen": -327.8219909667969, + "logps/rejected": -311.0113525390625, + "loss": 0.1091, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.7486933469772339, - "rewards/margins": 7.747340202331543, - "rewards/rejected": -6.998647212982178, + "rewards/chosen": 1.0934474468231201, + "rewards/margins": 9.316500663757324, + "rewards/rejected": -8.223054885864258, "step": 2770 }, { "epoch": 1.44, "learning_rate": 2.898259705488621e-07, - "logits/chosen": -2.950000047683716, - "logits/rejected": -3.0376858711242676, - "logps/chosen": -167.78564453125, - "logps/rejected": -316.28973388671875, - "loss": 0.0717, + "logits/chosen": -2.845902681350708, + "logits/rejected": -2.9347774982452393, + "logps/chosen": -163.2794647216797, + "logps/rejected": -310.783447265625, + "loss": 0.0449, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.9677039980888367, - "rewards/margins": 7.978043556213379, - "rewards/rejected": -7.010340213775635, + "rewards/chosen": 1.418323278427124, + "rewards/margins": 10.092653274536133, + "rewards/rejected": -8.67432975769043, "step": 2780 }, { "epoch": 1.44, "learning_rate": 2.888697647733792e-07, - "logits/chosen": -2.950089693069458, - "logits/rejected": -2.9753165245056152, - "logps/chosen": -271.12347412109375, - "logps/rejected": -380.61383056640625, - "loss": 0.1185, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.3904229402542114, - "rewards/margins": 9.421290397644043, - "rewards/rejected": -8.030868530273438, + "logits/chosen": -2.848527431488037, + "logits/rejected": -2.8778862953186035, + "logps/chosen": -268.2187805175781, + "logps/rejected": -355.076416015625, + "loss": 0.1047, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.683945655822754, + "rewards/margins": 11.462151527404785, + "rewards/rejected": -9.778204917907715, "step": 2790 }, { "epoch": 1.45, "learning_rate": 2.879135589978963e-07, - "logits/chosen": -2.8828554153442383, - "logits/rejected": -2.95314359664917, - "logps/chosen": -220.3723602294922, - "logps/rejected": -351.38153076171875, - "loss": 0.0733, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.192879319190979, - "rewards/margins": 7.078371524810791, - "rewards/rejected": -5.885491371154785, + "logits/chosen": -2.8241419792175293, + "logits/rejected": -2.895460605621338, + "logps/chosen": -214.8736572265625, + "logps/rejected": -325.08441162109375, + "loss": 0.0744, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.741660714149475, + "rewards/margins": 9.883358001708984, + "rewards/rejected": -8.141695022583008, "step": 2800 }, { "epoch": 1.45, - "eval_logits/chosen": -2.8813791275024414, - "eval_logits/rejected": -2.931053876876831, - "eval_logps/chosen": -249.5429229736328, - "eval_logps/rejected": -317.8975524902344, - "eval_loss": 0.47381946444511414, - "eval_rewards/accuracies": 0.8119999766349792, - "eval_rewards/chosen": -0.18627841770648956, - "eval_rewards/margins": 3.9897348880767822, - "eval_rewards/rejected": -4.176013469696045, - "eval_runtime": 278.3124, - "eval_samples_per_second": 7.186, - "eval_steps_per_second": 0.449, + "eval_logits/chosen": -2.854137897491455, + "eval_logits/rejected": -2.886112928390503, + "eval_logps/chosen": -243.70782470703125, + "eval_logps/rejected": -284.8834228515625, + "eval_loss": 0.3599693179130554, + "eval_rewards/accuracies": 0.8579999804496765, + "eval_rewards/chosen": 0.4106691777706146, + "eval_rewards/margins": 6.325629234313965, + "eval_rewards/rejected": -5.9149603843688965, + "eval_runtime": 276.1433, + "eval_samples_per_second": 7.243, + "eval_steps_per_second": 0.453, "step": 2800 }, { "epoch": 1.45, "learning_rate": 2.8695735322241344e-07, - "logits/chosen": -2.9480504989624023, - "logits/rejected": -2.9745919704437256, - "logps/chosen": -222.18984985351562, - "logps/rejected": -341.6588439941406, - "loss": 0.0824, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.742396593093872, - "rewards/margins": 8.565423011779785, - "rewards/rejected": -6.823026180267334, + "logits/chosen": -2.887256145477295, + "logits/rejected": -2.9048309326171875, + "logps/chosen": -218.23593139648438, + "logps/rejected": -301.92962646484375, + "loss": 0.1199, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1383767127990723, + "rewards/margins": 10.393625259399414, + "rewards/rejected": -8.2552490234375, "step": 2810 }, { "epoch": 1.46, "learning_rate": 2.8600114744693055e-07, - "logits/chosen": -2.8825483322143555, - "logits/rejected": -2.901355266571045, - "logps/chosen": -202.0137176513672, - "logps/rejected": -344.7021789550781, - "loss": 0.0785, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.592545986175537, - "rewards/margins": 8.239890098571777, - "rewards/rejected": -6.64734411239624, + "logits/chosen": -2.7940280437469482, + "logits/rejected": -2.832387924194336, + "logps/chosen": -200.12815856933594, + "logps/rejected": -319.68115234375, + "loss": 0.0553, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.7811024188995361, + "rewards/margins": 10.244537353515625, + "rewards/rejected": -8.463435173034668, "step": 2820 }, { "epoch": 1.46, "learning_rate": 2.8504494167144767e-07, - "logits/chosen": -2.945234775543213, - "logits/rejected": -2.946733236312866, - "logps/chosen": -282.22637939453125, - "logps/rejected": -344.6810302734375, - "loss": 0.0707, + "logits/chosen": -2.887563467025757, + "logits/rejected": -2.8253684043884277, + "logps/chosen": -285.80450439453125, + "logps/rejected": -292.8382873535156, + "loss": 0.0963, "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.0531352758407593, - "rewards/margins": 6.8026628494262695, - "rewards/rejected": -5.749527454376221, + "rewards/chosen": 1.0141702890396118, + "rewards/margins": 9.71042537689209, + "rewards/rejected": -8.69625473022461, "step": 2830 }, { "epoch": 1.47, "learning_rate": 2.8408873589596484e-07, - "logits/chosen": -2.9102578163146973, - "logits/rejected": -2.9559600353240967, - "logps/chosen": -233.95407104492188, - "logps/rejected": -308.8085632324219, - "loss": 0.0953, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.3472100496292114, - "rewards/margins": 7.695229530334473, - "rewards/rejected": -6.348019599914551, + "logits/chosen": -2.821132183074951, + "logits/rejected": -2.8314313888549805, + "logps/chosen": -232.6450958251953, + "logps/rejected": -286.07867431640625, + "loss": 0.057, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.47810959815979, + "rewards/margins": 10.077777862548828, + "rewards/rejected": -8.599668502807617, "step": 2840 }, { "epoch": 1.47, "learning_rate": 2.8313253012048195e-07, - "logits/chosen": -2.8096060752868652, - "logits/rejected": -2.82187557220459, - "logps/chosen": -232.92648315429688, - "logps/rejected": -363.13311767578125, - "loss": 0.1149, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.446784496307373, - "rewards/margins": 7.554309844970703, - "rewards/rejected": -6.107525825500488, + "logits/chosen": -2.668931245803833, + "logits/rejected": -2.6988089084625244, + "logps/chosen": -229.1149139404297, + "logps/rejected": -339.18316650390625, + "loss": 0.087, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 2.2288527488708496, + "rewards/margins": 10.593650817871094, + "rewards/rejected": -8.364797592163086, "step": 2850 }, { "epoch": 1.48, "learning_rate": 2.8217632434499907e-07, - "logits/chosen": -2.884981632232666, - "logits/rejected": -2.9340274333953857, - "logps/chosen": -282.1310729980469, - "logps/rejected": -352.1937561035156, - "loss": 0.0681, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.166867971420288, - "rewards/margins": 7.277437686920166, - "rewards/rejected": -6.110569000244141, + "logits/chosen": -2.8059301376342773, + "logits/rejected": -2.8067476749420166, + "logps/chosen": -277.11492919921875, + "logps/rejected": -315.45123291015625, + "loss": 0.0496, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.6698763370513916, + "rewards/margins": 8.706342697143555, + "rewards/rejected": -7.036465644836426, "step": 2860 }, { "epoch": 1.48, "learning_rate": 2.812201185695162e-07, - "logits/chosen": -2.9189248085021973, - "logits/rejected": -2.939964771270752, - "logps/chosen": -189.5813751220703, - "logps/rejected": -333.9523010253906, - "loss": 0.0679, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 1.9171149730682373, - "rewards/margins": 7.77141809463501, - "rewards/rejected": -5.854302406311035, + "logits/chosen": -2.833714723587036, + "logits/rejected": -2.8098435401916504, + "logps/chosen": -193.87815856933594, + "logps/rejected": -275.5238952636719, + "loss": 0.0411, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.8069305419921875, + "rewards/margins": 10.389324188232422, + "rewards/rejected": -8.582392692565918, "step": 2870 }, { "epoch": 1.49, "learning_rate": 2.802639127940333e-07, - "logits/chosen": -2.889430046081543, - "logits/rejected": -2.902904748916626, - "logps/chosen": -226.6869659423828, - "logps/rejected": -277.851806640625, - "loss": 0.1103, - "rewards/accuracies": 0.9375, - "rewards/chosen": 1.285125732421875, - "rewards/margins": 7.303654670715332, - "rewards/rejected": -6.018527507781982, + "logits/chosen": -2.8054168224334717, + "logits/rejected": -2.808819055557251, + "logps/chosen": -235.84619140625, + "logps/rejected": -246.9005584716797, + "loss": 0.0684, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.1296446323394775, + "rewards/margins": 9.52583122253418, + "rewards/rejected": -8.396186828613281, "step": 2880 }, { "epoch": 1.49, "learning_rate": 2.7930770701855036e-07, - "logits/chosen": -2.9159200191497803, - "logits/rejected": -2.9363036155700684, - "logps/chosen": -252.4503936767578, - "logps/rejected": -314.83441162109375, - "loss": 0.0992, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.2371034622192383, - "rewards/margins": 7.61843204498291, - "rewards/rejected": -6.381328582763672, + "logits/chosen": -2.8294785022735596, + "logits/rejected": -2.810607671737671, + "logps/chosen": -245.90664672851562, + "logps/rejected": -288.733642578125, + "loss": 0.0781, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.8571436405181885, + "rewards/margins": 10.793990135192871, + "rewards/rejected": -8.936845779418945, "step": 2890 }, { "epoch": 1.5, "learning_rate": 2.783515012430675e-07, - "logits/chosen": -2.8979902267456055, - "logits/rejected": -2.880366086959839, - "logps/chosen": -231.14559936523438, - "logps/rejected": -334.34869384765625, - "loss": 0.1837, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.5941873788833618, - "rewards/margins": 7.379087924957275, - "rewards/rejected": -5.784900188446045, + "logits/chosen": -2.7668967247009277, + "logits/rejected": -2.750340461730957, + "logps/chosen": -229.5404510498047, + "logps/rejected": -274.2742614746094, + "loss": 0.0542, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.7575082778930664, + "rewards/margins": 10.381665229797363, + "rewards/rejected": -8.624155044555664, "step": 2900 }, { "epoch": 1.5, - "eval_logits/chosen": -2.8720028400421143, - "eval_logits/rejected": -2.9294917583465576, - "eval_logps/chosen": -247.88092041015625, - "eval_logps/rejected": -318.89837646484375, - "eval_loss": 0.47644469141960144, - "eval_rewards/accuracies": 0.8059999942779541, - "eval_rewards/chosen": -0.02007720246911049, - "eval_rewards/margins": 4.2560200691223145, - "eval_rewards/rejected": -4.276097774505615, - "eval_runtime": 278.8337, - "eval_samples_per_second": 7.173, - "eval_steps_per_second": 0.448, + "eval_logits/chosen": -2.76845121383667, + "eval_logits/rejected": -2.7995309829711914, + "eval_logps/chosen": -243.07557678222656, + "eval_logps/rejected": -291.58514404296875, + "eval_loss": 0.359036922454834, + "eval_rewards/accuracies": 0.8619999885559082, + "eval_rewards/chosen": 0.4738897383213043, + "eval_rewards/margins": 7.059021949768066, + "eval_rewards/rejected": -6.585132122039795, + "eval_runtime": 277.0222, + "eval_samples_per_second": 7.22, + "eval_steps_per_second": 0.451, "step": 2900 }, { "epoch": 1.5, "learning_rate": 2.773952954675846e-07, - "logits/chosen": -2.932246446609497, - "logits/rejected": -2.9477548599243164, - "logps/chosen": -220.763916015625, - "logps/rejected": -286.6198425292969, - "loss": 0.1087, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 1.352697730064392, - "rewards/margins": 6.958559989929199, - "rewards/rejected": -5.605862140655518, + "logits/chosen": -2.794093608856201, + "logits/rejected": -2.8225858211517334, + "logps/chosen": -217.7832489013672, + "logps/rejected": -260.6131286621094, + "loss": 0.115, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.6524279117584229, + "rewards/margins": 9.465045928955078, + "rewards/rejected": -7.812617301940918, "step": 2910 }, { "epoch": 1.51, "learning_rate": 2.764390896921017e-07, - "logits/chosen": -2.934304714202881, - "logits/rejected": -2.936347723007202, - "logps/chosen": -256.7560729980469, - "logps/rejected": -320.73223876953125, - "loss": 0.117, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.7255386114120483, - "rewards/margins": 7.111201286315918, - "rewards/rejected": -6.38566255569458, + "logits/chosen": -2.8061718940734863, + "logits/rejected": -2.763732433319092, + "logps/chosen": -252.5445556640625, + "logps/rejected": -251.26034545898438, + "loss": 0.0901, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.1475918292999268, + "rewards/margins": 8.924840927124023, + "rewards/rejected": -7.777248382568359, "step": 2920 }, { "epoch": 1.51, "learning_rate": 2.754828839166188e-07, - "logits/chosen": -2.8261325359344482, - "logits/rejected": -2.9312081336975098, - "logps/chosen": -233.1983642578125, - "logps/rejected": -311.11590576171875, - "loss": 0.0778, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.7787030935287476, - "rewards/margins": 9.04845142364502, - "rewards/rejected": -7.269747734069824, + "logits/chosen": -2.68436336517334, + "logits/rejected": -2.760284900665283, + "logps/chosen": -233.3820037841797, + "logps/rejected": -307.3991394042969, + "loss": 0.1083, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.7603378295898438, + "rewards/margins": 10.544532775878906, + "rewards/rejected": -8.784194946289062, "step": 2930 }, { "epoch": 1.52, "learning_rate": 2.7452667814113594e-07, - "logits/chosen": -2.989229202270508, - "logits/rejected": -2.977133274078369, - "logps/chosen": -293.3673400878906, - "logps/rejected": -334.6380310058594, - "loss": 0.0789, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.3550571203231812, - "rewards/margins": 6.886478424072266, - "rewards/rejected": -5.531420707702637, + "logits/chosen": -2.8724405765533447, + "logits/rejected": -2.858412265777588, + "logps/chosen": -295.5524597167969, + "logps/rejected": -297.13909912109375, + "loss": 0.0561, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.1383148431777954, + "rewards/margins": 9.56761646270752, + "rewards/rejected": -8.429302215576172, "step": 2940 }, { "epoch": 1.52, "learning_rate": 2.7357047236565306e-07, - "logits/chosen": -2.8399806022644043, - "logits/rejected": -2.9234249591827393, - "logps/chosen": -234.34567260742188, - "logps/rejected": -336.1915588378906, - "loss": 0.0704, + "logits/chosen": -2.722064256668091, + "logits/rejected": -2.7667765617370605, + "logps/chosen": -227.0841827392578, + "logps/rejected": -290.9980163574219, + "loss": 0.0591, "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.254919171333313, - "rewards/margins": 7.959687232971191, - "rewards/rejected": -6.704766750335693, + "rewards/chosen": 1.9815807342529297, + "rewards/margins": 10.175790786743164, + "rewards/rejected": -8.19421100616455, "step": 2950 }, { "epoch": 1.53, "learning_rate": 2.7261426659017017e-07, - "logits/chosen": -2.92905855178833, - "logits/rejected": -2.957266330718994, - "logps/chosen": -282.38031005859375, - "logps/rejected": -350.7286682128906, - "loss": 0.1406, + "logits/chosen": -2.7820794582366943, + "logits/rejected": -2.782778024673462, + "logps/chosen": -274.3792419433594, + "logps/rejected": -311.2986145019531, + "loss": 0.1068, "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.6080772280693054, - "rewards/margins": 7.091387748718262, - "rewards/rejected": -6.483310699462891, + "rewards/chosen": 1.4096379280090332, + "rewards/margins": 9.630009651184082, + "rewards/rejected": -8.22037124633789, "step": 2960 }, { "epoch": 1.53, "learning_rate": 2.716580608146873e-07, - "logits/chosen": -2.959866762161255, - "logits/rejected": -2.978870391845703, - "logps/chosen": -312.2777404785156, - "logps/rejected": -347.9805603027344, - "loss": 0.0691, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.632709264755249, - "rewards/margins": 8.156556129455566, - "rewards/rejected": -6.523846626281738, + "logits/chosen": -2.794208526611328, + "logits/rejected": -2.7676689624786377, + "logps/chosen": -318.16693115234375, + "logps/rejected": -287.7646789550781, + "loss": 0.0775, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.7113300561904907, + "rewards/margins": 11.331686973571777, + "rewards/rejected": -9.620355606079102, "step": 2970 }, { "epoch": 1.54, "learning_rate": 2.7070185503920446e-07, - "logits/chosen": -2.7604541778564453, - "logits/rejected": -2.850285053253174, - "logps/chosen": -251.9482421875, - "logps/rejected": -328.6760559082031, - "loss": 0.0711, + "logits/chosen": -2.564415693283081, + "logits/rejected": -2.6571946144104004, + "logps/chosen": -256.3440246582031, + "logps/rejected": -323.9332580566406, + "loss": 0.0505, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.5054420232772827, - "rewards/margins": 7.880296230316162, - "rewards/rejected": -6.3748555183410645, + "rewards/chosen": 1.065863013267517, + "rewards/margins": 9.591228485107422, + "rewards/rejected": -8.525365829467773, "step": 2980 }, { "epoch": 1.54, "learning_rate": 2.6974564926372157e-07, - "logits/chosen": -2.944751739501953, - "logits/rejected": -2.9440500736236572, - "logps/chosen": -243.61813354492188, - "logps/rejected": -359.72650146484375, - "loss": 0.0765, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.8091905117034912, - "rewards/margins": 8.164262771606445, - "rewards/rejected": -6.355072975158691, + "logits/chosen": -2.7803008556365967, + "logits/rejected": -2.75093412399292, + "logps/chosen": -245.029052734375, + "logps/rejected": -312.7168884277344, + "loss": 0.0434, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6671940088272095, + "rewards/margins": 10.378290176391602, + "rewards/rejected": -8.711095809936523, "step": 2990 }, { "epoch": 1.55, "learning_rate": 2.687894434882387e-07, - "logits/chosen": -2.9194934368133545, - "logits/rejected": -2.926443099975586, - "logps/chosen": -212.02090454101562, - "logps/rejected": -273.5159912109375, - "loss": 0.2113, + "logits/chosen": -2.7880043983459473, + "logits/rejected": -2.7584502696990967, + "logps/chosen": -209.9309844970703, + "logps/rejected": -240.32113647460938, + "loss": 0.0534, "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.5424438714981079, - "rewards/margins": 5.714383602142334, - "rewards/rejected": -5.171939373016357, + "rewards/chosen": 0.7528551816940308, + "rewards/margins": 8.421224594116211, + "rewards/rejected": -7.668370723724365, "step": 3000 }, { "epoch": 1.55, - "eval_logits/chosen": -2.843538999557495, - "eval_logits/rejected": -2.8977818489074707, - "eval_logps/chosen": -248.24984741210938, - "eval_logps/rejected": -315.9092712402344, - "eval_loss": 0.470895916223526, - "eval_rewards/accuracies": 0.8080000281333923, - "eval_rewards/chosen": -0.056969307363033295, - "eval_rewards/margins": 3.920220375061035, - "eval_rewards/rejected": -3.9771900177001953, - "eval_runtime": 278.5009, - "eval_samples_per_second": 7.181, - "eval_steps_per_second": 0.449, + "eval_logits/chosen": -2.718498706817627, + "eval_logits/rejected": -2.751434087753296, + "eval_logps/chosen": -245.082275390625, + "eval_logps/rejected": -291.26690673828125, + "eval_loss": 0.33173465728759766, + "eval_rewards/accuracies": 0.8640000224113464, + "eval_rewards/chosen": 0.2732241749763489, + "eval_rewards/margins": 6.826529502868652, + "eval_rewards/rejected": -6.553304672241211, + "eval_runtime": 276.1801, + "eval_samples_per_second": 7.242, + "eval_steps_per_second": 0.453, "step": 3000 }, { "epoch": 1.55, "learning_rate": 2.678332377127558e-07, - "logits/chosen": -2.9059603214263916, - "logits/rejected": -2.879521369934082, - "logps/chosen": -255.6497039794922, - "logps/rejected": -346.8174743652344, - "loss": 0.0754, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.9771935939788818, - "rewards/margins": 9.132074356079102, - "rewards/rejected": -7.154881477355957, + "logits/chosen": -2.786602020263672, + "logits/rejected": -2.764167308807373, + "logps/chosen": -250.4424591064453, + "logps/rejected": -337.5587158203125, + "loss": 0.055, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.4979145526885986, + "rewards/margins": 12.30707836151123, + "rewards/rejected": -9.809165000915527, "step": 3010 }, { "epoch": 1.56, "learning_rate": 2.668770319372729e-07, - "logits/chosen": -2.8338193893432617, - "logits/rejected": -2.904914140701294, - "logps/chosen": -186.17552185058594, - "logps/rejected": -281.7873229980469, - "loss": 0.0663, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.6421797275543213, - "rewards/margins": 7.126562595367432, - "rewards/rejected": -5.484382629394531, + "logits/chosen": -2.7259132862091064, + "logits/rejected": -2.8007891178131104, + "logps/chosen": -186.4517364501953, + "logps/rejected": -266.7397766113281, + "loss": 0.0452, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.6136891841888428, + "rewards/margins": 9.868091583251953, + "rewards/rejected": -8.254402160644531, "step": 3020 }, { "epoch": 1.56, "learning_rate": 2.6592082616179004e-07, - "logits/chosen": -2.760566234588623, - "logits/rejected": -2.785473585128784, - "logps/chosen": -195.1143341064453, - "logps/rejected": -304.75128173828125, - "loss": 0.0727, + "logits/chosen": -2.6599555015563965, + "logits/rejected": -2.628469944000244, + "logps/chosen": -192.6136474609375, + "logps/rejected": -246.0712127685547, + "loss": 0.0723, "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.42879682779312134, - "rewards/margins": 6.535356044769287, - "rewards/rejected": -6.1065592765808105, + "rewards/chosen": 0.6181789636611938, + "rewards/margins": 9.498112678527832, + "rewards/rejected": -8.87993335723877, "step": 3030 }, { "epoch": 1.57, "learning_rate": 2.649646203863071e-07, - "logits/chosen": -2.8217978477478027, - "logits/rejected": -2.904336929321289, - "logps/chosen": -247.8555450439453, - "logps/rejected": -356.73992919921875, - "loss": 0.1016, - "rewards/accuracies": 0.9375, - "rewards/chosen": 1.409906029701233, - "rewards/margins": 7.027436256408691, - "rewards/rejected": -5.617530345916748, + "logits/chosen": -2.7082459926605225, + "logits/rejected": -2.7981760501861572, + "logps/chosen": -248.25634765625, + "logps/rejected": -334.35540771484375, + "loss": 0.0693, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.370101809501648, + "rewards/margins": 9.701082229614258, + "rewards/rejected": -8.33098030090332, "step": 3040 }, { "epoch": 1.57, "learning_rate": 2.640084146108242e-07, - "logits/chosen": -2.8797459602355957, - "logits/rejected": -2.927067279815674, - "logps/chosen": -301.1026306152344, - "logps/rejected": -334.0341491699219, - "loss": 0.0747, + "logits/chosen": -2.811915159225464, + "logits/rejected": -2.8289389610290527, + "logps/chosen": -298.9040832519531, + "logps/rejected": -294.5615234375, + "loss": 0.0721, "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.892419159412384, - "rewards/margins": 7.652726650238037, - "rewards/rejected": -6.760307312011719, + "rewards/chosen": 1.1836546659469604, + "rewards/margins": 10.885623931884766, + "rewards/rejected": -9.701969146728516, "step": 3050 }, { "epoch": 1.58, "learning_rate": 2.6305220883534133e-07, - "logits/chosen": -2.8873353004455566, - "logits/rejected": -2.894935369491577, - "logps/chosen": -186.02676391601562, - "logps/rejected": -322.01812744140625, - "loss": 0.082, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.6815090775489807, - "rewards/margins": 7.164406776428223, - "rewards/rejected": -6.4828972816467285, + "logits/chosen": -2.815936803817749, + "logits/rejected": -2.8219399452209473, + "logps/chosen": -181.8057861328125, + "logps/rejected": -305.96478271484375, + "loss": 0.0694, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.1036055088043213, + "rewards/margins": 10.693105697631836, + "rewards/rejected": -9.589500427246094, "step": 3060 }, { "epoch": 1.58, "learning_rate": 2.6209600305985845e-07, - "logits/chosen": -2.846514940261841, - "logits/rejected": -2.900930166244507, - "logps/chosen": -227.4207305908203, - "logps/rejected": -353.51153564453125, - "loss": 0.1415, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.1509945392608643, - "rewards/margins": 8.626825332641602, - "rewards/rejected": -7.475831031799316, + "logits/chosen": -2.797356367111206, + "logits/rejected": -2.844510316848755, + "logps/chosen": -229.5518798828125, + "logps/rejected": -302.3344421386719, + "loss": 0.0785, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.228035807609558, + "rewards/margins": 10.740432739257812, + "rewards/rejected": -9.512395858764648, "step": 3070 }, { "epoch": 1.59, "learning_rate": 2.6113979728437556e-07, - "logits/chosen": -2.9146270751953125, - "logits/rejected": -2.9426653385162354, - "logps/chosen": -249.2900848388672, - "logps/rejected": -329.2434387207031, - "loss": 0.0853, + "logits/chosen": -2.859743356704712, + "logits/rejected": -2.8888421058654785, + "logps/chosen": -250.04336547851562, + "logps/rejected": -301.0086975097656, + "loss": 0.15, "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.4317848682403564, - "rewards/margins": 7.4246826171875, - "rewards/rejected": -5.992897033691406, + "rewards/chosen": 1.3564578294754028, + "rewards/margins": 9.660563468933105, + "rewards/rejected": -8.304105758666992, "step": 3080 }, { "epoch": 1.6, "learning_rate": 2.601835915088927e-07, - "logits/chosen": -2.760486125946045, - "logits/rejected": -2.8186051845550537, - "logps/chosen": -211.21728515625, - "logps/rejected": -329.289794921875, - "loss": 0.1681, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.4863201677799225, - "rewards/margins": 6.338644027709961, - "rewards/rejected": -5.85232400894165, + "logits/chosen": -2.730300188064575, + "logits/rejected": -2.7773311138153076, + "logps/chosen": -211.5944366455078, + "logps/rejected": -303.1237487792969, + "loss": 0.0867, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.4511043131351471, + "rewards/margins": 9.371929168701172, + "rewards/rejected": -8.92082405090332, "step": 3090 }, { "epoch": 1.6, "learning_rate": 2.592273857334098e-07, - "logits/chosen": -2.8253026008605957, - "logits/rejected": -2.86600661277771, - "logps/chosen": -195.7429656982422, - "logps/rejected": -351.58770751953125, - "loss": 0.1858, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.2111473083496094, - "rewards/margins": 8.60848331451416, - "rewards/rejected": -7.397336006164551, + "logits/chosen": -2.8162806034088135, + "logits/rejected": -2.871568202972412, + "logps/chosen": -192.22019958496094, + "logps/rejected": -313.6377258300781, + "loss": 0.0552, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.5751912593841553, + "rewards/margins": 10.191747665405273, + "rewards/rejected": -8.616556167602539, "step": 3100 }, { "epoch": 1.6, - "eval_logits/chosen": -2.849841833114624, - "eval_logits/rejected": -2.9042794704437256, - "eval_logps/chosen": -249.63946533203125, - "eval_logps/rejected": -318.3751220703125, - "eval_loss": 0.47693005204200745, - "eval_rewards/accuracies": 0.7960000038146973, - "eval_rewards/chosen": -0.19593170285224915, - "eval_rewards/margins": 4.027840614318848, - "eval_rewards/rejected": -4.2237725257873535, - "eval_runtime": 278.9329, - "eval_samples_per_second": 7.17, - "eval_steps_per_second": 0.448, + "eval_logits/chosen": -2.8457260131835938, + "eval_logits/rejected": -2.8895390033721924, + "eval_logps/chosen": -245.59381103515625, + "eval_logps/rejected": -294.64349365234375, + "eval_loss": 0.3434513807296753, + "eval_rewards/accuracies": 0.8679999709129333, + "eval_rewards/chosen": 0.22206954658031464, + "eval_rewards/margins": 7.113034248352051, + "eval_rewards/rejected": -6.890964984893799, + "eval_runtime": 276.3315, + "eval_samples_per_second": 7.238, + "eval_steps_per_second": 0.452, "step": 3100 }, { "epoch": 1.61, "learning_rate": 2.582711799579269e-07, - "logits/chosen": -2.9083123207092285, - "logits/rejected": -2.9558422565460205, - "logps/chosen": -283.11962890625, - "logps/rejected": -353.2391052246094, - "loss": 0.1054, + "logits/chosen": -2.8935742378234863, + "logits/rejected": -2.919924259185791, + "logps/chosen": -283.63726806640625, + "logps/rejected": -309.16522216796875, + "loss": 0.1491, "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.2175567150115967, - "rewards/margins": 7.344391822814941, - "rewards/rejected": -6.126835346221924, + "rewards/chosen": 1.1673766374588013, + "rewards/margins": 10.357894897460938, + "rewards/rejected": -9.190518379211426, "step": 3110 }, { "epoch": 1.61, "learning_rate": 2.573149741824441e-07, - "logits/chosen": -2.9085631370544434, - "logits/rejected": -2.9231581687927246, - "logps/chosen": -255.6195831298828, - "logps/rejected": -324.60516357421875, - "loss": 0.1551, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.3148778676986694, - "rewards/margins": 7.363683223724365, - "rewards/rejected": -6.048805236816406, + "logits/chosen": -2.8520469665527344, + "logits/rejected": -2.889009952545166, + "logps/chosen": -261.79998779296875, + "logps/rejected": -314.26458740234375, + "loss": 0.0958, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.0541094541549683, + "rewards/margins": 9.267090797424316, + "rewards/rejected": -8.212981224060059, "step": 3120 }, { "epoch": 1.62, "learning_rate": 2.563587684069612e-07, - "logits/chosen": -2.9189603328704834, - "logits/rejected": -2.9584662914276123, - "logps/chosen": -279.16839599609375, - "logps/rejected": -339.174072265625, - "loss": 0.0702, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.0117626190185547, - "rewards/margins": 7.726855278015137, - "rewards/rejected": -6.715092658996582, + "logits/chosen": -2.877312183380127, + "logits/rejected": -2.8885257244110107, + "logps/chosen": -279.4521484375, + "logps/rejected": -320.37567138671875, + "loss": 0.052, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.9823194742202759, + "rewards/margins": 10.346516609191895, + "rewards/rejected": -9.364197731018066, "step": 3130 }, { "epoch": 1.62, "learning_rate": 2.554025626314783e-07, - "logits/chosen": -2.9455857276916504, - "logits/rejected": -2.9266839027404785, - "logps/chosen": -238.67636108398438, - "logps/rejected": -348.72894287109375, - "loss": 0.1138, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.6194132566452026, - "rewards/margins": 8.241477966308594, - "rewards/rejected": -7.62206506729126, + "logits/chosen": -2.889101505279541, + "logits/rejected": -2.870471954345703, + "logps/chosen": -239.0482635498047, + "logps/rejected": -315.1771240234375, + "loss": 0.0798, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.5822251439094543, + "rewards/margins": 10.573446273803711, + "rewards/rejected": -9.991220474243164, "step": 3140 }, { "epoch": 1.63, "learning_rate": 2.544463568559954e-07, - "logits/chosen": -2.9469656944274902, - "logits/rejected": -2.9347240924835205, - "logps/chosen": -267.73394775390625, - "logps/rejected": -353.4038391113281, - "loss": 0.0658, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.103703498840332, - "rewards/margins": 8.258222579956055, - "rewards/rejected": -7.154518127441406, + "logits/chosen": -2.8907864093780518, + "logits/rejected": -2.8877110481262207, + "logps/chosen": -268.4491271972656, + "logps/rejected": -299.43084716796875, + "loss": 0.0551, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.0321850776672363, + "rewards/margins": 9.17643928527832, + "rewards/rejected": -8.144253730773926, "step": 3150 }, { "epoch": 1.63, "learning_rate": 2.5349015108051254e-07, - "logits/chosen": -2.8953492641448975, - "logits/rejected": -2.9293885231018066, - "logps/chosen": -231.06106567382812, - "logps/rejected": -328.3216552734375, - "loss": 0.0572, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.0628279447555542, - "rewards/margins": 8.15467643737793, - "rewards/rejected": -7.091848850250244, + "logits/chosen": -2.8483214378356934, + "logits/rejected": -2.8745040893554688, + "logps/chosen": -235.03439331054688, + "logps/rejected": -320.50341796875, + "loss": 0.0478, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.077526330947876, + "rewards/margins": 9.315979957580566, + "rewards/rejected": -8.23845386505127, "step": 3160 }, { "epoch": 1.64, "learning_rate": 2.5253394530502966e-07, - "logits/chosen": -2.8384034633636475, - "logits/rejected": -2.867201089859009, - "logps/chosen": -332.78564453125, - "logps/rejected": -383.9205322265625, - "loss": 0.0625, + "logits/chosen": -2.800015926361084, + "logits/rejected": -2.8190784454345703, + "logps/chosen": -337.3632507324219, + "logps/rejected": -304.2581481933594, + "loss": 0.0826, "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.7102988958358765, - "rewards/margins": 7.870628356933594, - "rewards/rejected": -7.160330295562744, + "rewards/chosen": 0.5777830481529236, + "rewards/margins": 10.045036315917969, + "rewards/rejected": -9.467252731323242, "step": 3170 }, { "epoch": 1.64, "learning_rate": 2.5157773952954677e-07, - "logits/chosen": -2.939607620239258, - "logits/rejected": -2.9498610496520996, - "logps/chosen": -307.7158203125, - "logps/rejected": -341.71466064453125, - "loss": 0.071, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.9481692314147949, - "rewards/margins": 7.467158317565918, - "rewards/rejected": -6.518989562988281, + "logits/chosen": -2.8541150093078613, + "logits/rejected": -2.883697032928467, + "logps/chosen": -305.7124328613281, + "logps/rejected": -339.59527587890625, + "loss": 0.0466, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1485055685043335, + "rewards/margins": 9.632898330688477, + "rewards/rejected": -8.484393119812012, "step": 3180 }, { "epoch": 1.65, "learning_rate": 2.506215337540639e-07, - "logits/chosen": -2.8836703300476074, - "logits/rejected": -2.895805835723877, - "logps/chosen": -208.77883911132812, - "logps/rejected": -361.16754150390625, - "loss": 0.1088, + "logits/chosen": -2.793628215789795, + "logits/rejected": -2.800882339477539, + "logps/chosen": -211.18911743164062, + "logps/rejected": -315.6861877441406, + "loss": 0.1104, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.1989336013793945, - "rewards/margins": 7.9919114112854, - "rewards/rejected": -6.792977809906006, + "rewards/chosen": 0.9637197256088257, + "rewards/margins": 10.711942672729492, + "rewards/rejected": -9.748224258422852, "step": 3190 }, { "epoch": 1.65, "learning_rate": 2.4966532797858095e-07, - "logits/chosen": -2.8527767658233643, - "logits/rejected": -2.8542492389678955, - "logps/chosen": -248.8079376220703, - "logps/rejected": -321.9994201660156, - "loss": 0.095, + "logits/chosen": -2.8031210899353027, + "logits/rejected": -2.776792049407959, + "logps/chosen": -252.0576171875, + "logps/rejected": -293.24114990234375, + "loss": 0.0561, "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.9598841667175293, - "rewards/margins": 7.442416191101074, - "rewards/rejected": -6.482532501220703, + "rewards/chosen": 1.3012080192565918, + "rewards/margins": 10.132192611694336, + "rewards/rejected": -8.830984115600586, "step": 3200 }, { "epoch": 1.65, - "eval_logits/chosen": -2.8687751293182373, - "eval_logits/rejected": -2.928819417953491, - "eval_logps/chosen": -250.7627410888672, - "eval_logps/rejected": -319.1705017089844, - "eval_loss": 0.4938603341579437, - "eval_rewards/accuracies": 0.8119999766349792, - "eval_rewards/chosen": -0.3082582652568817, - "eval_rewards/margins": 3.9950480461120605, - "eval_rewards/rejected": -4.303306579589844, - "eval_runtime": 278.8397, - "eval_samples_per_second": 7.173, - "eval_steps_per_second": 0.448, + "eval_logits/chosen": -2.82877779006958, + "eval_logits/rejected": -2.8712751865386963, + "eval_logps/chosen": -244.92066955566406, + "eval_logps/rejected": -293.2156066894531, + "eval_loss": 0.32492488622665405, + "eval_rewards/accuracies": 0.8619999885559082, + "eval_rewards/chosen": 0.2893838584423065, + "eval_rewards/margins": 7.037559509277344, + "eval_rewards/rejected": -6.748175621032715, + "eval_runtime": 276.5351, + "eval_samples_per_second": 7.232, + "eval_steps_per_second": 0.452, "step": 3200 }, { "epoch": 1.66, "learning_rate": 2.4870912220309807e-07, - "logits/chosen": -2.84702467918396, - "logits/rejected": -2.915963649749756, - "logps/chosen": -257.31219482421875, - "logps/rejected": -324.8391418457031, - "loss": 0.0766, + "logits/chosen": -2.7796378135681152, + "logits/rejected": -2.8680028915405273, + "logps/chosen": -255.15133666992188, + "logps/rejected": -287.7635498046875, + "loss": 0.0563, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.4782533645629883, - "rewards/margins": 7.525382995605469, - "rewards/rejected": -6.0471296310424805, + "rewards/chosen": 1.693913459777832, + "rewards/margins": 9.529541969299316, + "rewards/rejected": -7.835629463195801, "step": 3210 }, { "epoch": 1.66, "learning_rate": 2.477529164276152e-07, - "logits/chosen": -2.9667117595672607, - "logits/rejected": -2.967026710510254, - "logps/chosen": -233.61123657226562, - "logps/rejected": -337.34967041015625, - "loss": 0.28, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.9073492288589478, - "rewards/margins": 6.835887908935547, - "rewards/rejected": -5.9285383224487305, + "logits/chosen": -2.933002471923828, + "logits/rejected": -2.9256365299224854, + "logps/chosen": -229.5218048095703, + "logps/rejected": -301.123046875, + "loss": 0.0577, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.599159598350525, + "rewards/margins": 10.19759464263916, + "rewards/rejected": -8.598433494567871, "step": 3220 }, { "epoch": 1.67, "learning_rate": 2.4679671065213235e-07, - "logits/chosen": -2.899747133255005, - "logits/rejected": -2.9681437015533447, - "logps/chosen": -247.35348510742188, - "logps/rejected": -374.6236572265625, - "loss": 0.0942, + "logits/chosen": -2.8897438049316406, + "logits/rejected": -2.9376511573791504, + "logps/chosen": -240.6859130859375, + "logps/rejected": -364.0736083984375, + "loss": 0.083, "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.231717824935913, - "rewards/margins": 7.991697788238525, - "rewards/rejected": -6.759980201721191, + "rewards/chosen": 1.8984752893447876, + "rewards/margins": 9.967618942260742, + "rewards/rejected": -8.06914234161377, "step": 3230 }, { "epoch": 1.67, "learning_rate": 2.4584050487664947e-07, - "logits/chosen": -2.859067440032959, - "logits/rejected": -2.8838469982147217, - "logps/chosen": -246.4993438720703, - "logps/rejected": -319.4035339355469, - "loss": 0.1055, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.8468250036239624, - "rewards/margins": 8.415315628051758, - "rewards/rejected": -6.568489074707031, + "logits/chosen": -2.8327298164367676, + "logits/rejected": -2.8426101207733154, + "logps/chosen": -246.0293426513672, + "logps/rejected": -286.42950439453125, + "loss": 0.071, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8943653106689453, + "rewards/margins": 10.964719772338867, + "rewards/rejected": -9.070355415344238, "step": 3240 }, { "epoch": 1.68, "learning_rate": 2.448842991011666e-07, - "logits/chosen": -2.8728671073913574, - "logits/rejected": -2.8752236366271973, - "logps/chosen": -225.12265014648438, - "logps/rejected": -329.4236755371094, - "loss": 0.1032, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.2581980228424072, - "rewards/margins": 7.4553961753845215, - "rewards/rejected": -6.197198390960693, + "logits/chosen": -2.852550983428955, + "logits/rejected": -2.854254961013794, + "logps/chosen": -224.29287719726562, + "logps/rejected": -308.294189453125, + "loss": 0.0463, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.3399924039840698, + "rewards/margins": 9.836767196655273, + "rewards/rejected": -8.496774673461914, "step": 3250 }, { "epoch": 1.68, "learning_rate": 2.439280933256837e-07, - "logits/chosen": -2.853830099105835, - "logits/rejected": -2.8993842601776123, - "logps/chosen": -206.5619659423828, - "logps/rejected": -349.40966796875, - "loss": 0.0579, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.0377991199493408, - "rewards/margins": 7.783722877502441, - "rewards/rejected": -6.7459235191345215, + "logits/chosen": -2.832066774368286, + "logits/rejected": -2.874682903289795, + "logps/chosen": -205.1912078857422, + "logps/rejected": -332.5052795410156, + "loss": 0.0323, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4458096027374268, + "rewards/margins": 11.231256484985352, + "rewards/rejected": -9.785446166992188, "step": 3260 }, { "epoch": 1.69, "learning_rate": 2.429718875502008e-07, - "logits/chosen": -2.723466634750366, - "logits/rejected": -2.810455799102783, - "logps/chosen": -234.6430206298828, - "logps/rejected": -392.9654541015625, - "loss": 0.0802, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.2275792360305786, - "rewards/margins": 8.184759140014648, - "rewards/rejected": -6.957180023193359, + "logits/chosen": -2.6971449851989746, + "logits/rejected": -2.7993435859680176, + "logps/chosen": -233.2151641845703, + "logps/rejected": -361.5522155761719, + "loss": 0.0441, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.3703675270080566, + "rewards/margins": 9.962080955505371, + "rewards/rejected": -8.591713905334473, "step": 3270 }, { "epoch": 1.69, "learning_rate": 2.420156817747179e-07, - "logits/chosen": -2.819467067718506, - "logits/rejected": -2.7857964038848877, - "logps/chosen": -255.324951171875, - "logps/rejected": -285.57769775390625, - "loss": 0.0542, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.5177501440048218, - "rewards/margins": 7.244959831237793, - "rewards/rejected": -5.727210521697998, + "logits/chosen": -2.7944576740264893, + "logits/rejected": -2.7598938941955566, + "logps/chosen": -253.043212890625, + "logps/rejected": -284.27850341796875, + "loss": 0.0329, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.745924711227417, + "rewards/margins": 10.65503215789795, + "rewards/rejected": -8.909107208251953, "step": 3280 }, { "epoch": 1.7, "learning_rate": 2.41059475999235e-07, - "logits/chosen": -2.8438751697540283, - "logits/rejected": -2.8182473182678223, - "logps/chosen": -228.69528198242188, - "logps/rejected": -317.17242431640625, - "loss": 0.1373, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.5363596677780151, - "rewards/margins": 6.843097686767578, - "rewards/rejected": -6.306737899780273, + "logits/chosen": -2.8297312259674072, + "logits/rejected": -2.758760929107666, + "logps/chosen": -228.59701538085938, + "logps/rejected": -255.7022705078125, + "loss": 0.1259, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.6624184846878052, + "rewards/margins": 8.802133560180664, + "rewards/rejected": -8.139715194702148, "step": 3290 }, { "epoch": 1.7, "learning_rate": 2.4010327022375216e-07, - "logits/chosen": -2.859078884124756, - "logits/rejected": -2.923827648162842, - "logps/chosen": -244.914794921875, - "logps/rejected": -357.98052978515625, - "loss": 0.1147, + "logits/chosen": -2.837235927581787, + "logits/rejected": -2.887120008468628, + "logps/chosen": -242.4871826171875, + "logps/rejected": -347.31182861328125, + "loss": 0.0898, "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.0887374877929688, - "rewards/margins": 7.975939750671387, - "rewards/rejected": -6.887202262878418, + "rewards/chosen": 1.3313019275665283, + "rewards/margins": 10.235647201538086, + "rewards/rejected": -8.90434455871582, "step": 3300 }, { "epoch": 1.7, - "eval_logits/chosen": -2.848367214202881, - "eval_logits/rejected": -2.91117787361145, - "eval_logps/chosen": -252.27926635742188, - "eval_logps/rejected": -323.21832275390625, - "eval_loss": 0.48966941237449646, - "eval_rewards/accuracies": 0.8080000281333923, - "eval_rewards/chosen": -0.4599113464355469, - "eval_rewards/margins": 4.2481770515441895, - "eval_rewards/rejected": -4.708088397979736, - "eval_runtime": 278.765, - "eval_samples_per_second": 7.175, - "eval_steps_per_second": 0.448, + "eval_logits/chosen": -2.824777841567993, + "eval_logits/rejected": -2.8631463050842285, + "eval_logps/chosen": -247.55923461914062, + "eval_logps/rejected": -298.371337890625, + "eval_loss": 0.33954623341560364, + "eval_rewards/accuracies": 0.8600000143051147, + "eval_rewards/chosen": 0.0255246851593256, + "eval_rewards/margins": 7.289275169372559, + "eval_rewards/rejected": -7.2637505531311035, + "eval_runtime": 276.5499, + "eval_samples_per_second": 7.232, + "eval_steps_per_second": 0.452, "step": 3300 }, { "epoch": 1.71, "learning_rate": 2.391470644482693e-07, - "logits/chosen": -2.8616204261779785, - "logits/rejected": -2.943343162536621, - "logps/chosen": -249.031494140625, - "logps/rejected": -359.6810607910156, - "loss": 0.1186, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.4715783596038818, - "rewards/margins": 8.211606979370117, - "rewards/rejected": -6.740028381347656, + "logits/chosen": -2.84897780418396, + "logits/rejected": -2.912201404571533, + "logps/chosen": -246.42855834960938, + "logps/rejected": -297.96405029296875, + "loss": 0.0435, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.7292728424072266, + "rewards/margins": 10.75324821472168, + "rewards/rejected": -9.023974418640137, "step": 3310 }, { "epoch": 1.71, "learning_rate": 2.3819085867278636e-07, - "logits/chosen": -2.801548480987549, - "logits/rejected": -2.8232216835021973, - "logps/chosen": -170.48995971679688, - "logps/rejected": -271.6853942871094, - "loss": 0.0903, + "logits/chosen": -2.8075404167175293, + "logits/rejected": -2.815871477127075, + "logps/chosen": -170.3408203125, + "logps/rejected": -253.8338623046875, + "loss": 0.0533, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.2638075053691864, - "rewards/margins": 6.040502548217773, - "rewards/rejected": -5.7766947746276855, + "rewards/chosen": 0.27967214584350586, + "rewards/margins": 8.253989219665527, + "rewards/rejected": -7.974316596984863, "step": 3320 }, { "epoch": 1.72, "learning_rate": 2.3723465289730348e-07, - "logits/chosen": -2.888091564178467, - "logits/rejected": -2.9160046577453613, - "logps/chosen": -257.8720703125, - "logps/rejected": -289.5814208984375, - "loss": 0.0955, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.3228559494018555, - "rewards/margins": 6.6165056228637695, - "rewards/rejected": -5.293649673461914, + "logits/chosen": -2.9047446250915527, + "logits/rejected": -2.930854082107544, + "logps/chosen": -254.4055938720703, + "logps/rejected": -290.2741394042969, + "loss": 0.1226, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.6695034503936768, + "rewards/margins": 8.73763656616211, + "rewards/rejected": -7.0681328773498535, "step": 3330 }, { "epoch": 1.72, "learning_rate": 2.362784471218206e-07, - "logits/chosen": -2.8482556343078613, - "logits/rejected": -2.866028308868408, - "logps/chosen": -247.8570098876953, - "logps/rejected": -353.5355224609375, - "loss": 0.0629, + "logits/chosen": -2.8588271141052246, + "logits/rejected": -2.8775086402893066, + "logps/chosen": -245.676025390625, + "logps/rejected": -335.73736572265625, + "loss": 0.0774, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.318343162536621, - "rewards/margins": 7.900382041931152, - "rewards/rejected": -6.582038879394531, + "rewards/chosen": 1.5354220867156982, + "rewards/margins": 9.892122268676758, + "rewards/rejected": -8.35669994354248, "step": 3340 }, { "epoch": 1.73, "learning_rate": 2.353222413463377e-07, - "logits/chosen": -2.8011178970336914, - "logits/rejected": -2.826658248901367, - "logps/chosen": -213.5056610107422, - "logps/rejected": -359.37542724609375, - "loss": 0.0429, + "logits/chosen": -2.816009759902954, + "logits/rejected": -2.800391912460327, + "logps/chosen": -210.69827270507812, + "logps/rejected": -326.49615478515625, + "loss": 0.0255, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.0736242532730103, - "rewards/margins": 7.892706394195557, - "rewards/rejected": -6.819081783294678, + "rewards/chosen": 1.3558534383773804, + "rewards/margins": 10.525764465332031, + "rewards/rejected": -9.16991138458252, "step": 3350 }, { "epoch": 1.73, "learning_rate": 2.3436603557085483e-07, - "logits/chosen": -2.8530900478363037, - "logits/rejected": -2.8854198455810547, - "logps/chosen": -218.562255859375, - "logps/rejected": -370.8888244628906, - "loss": 0.0755, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.5468239784240723, - "rewards/margins": 8.230497360229492, - "rewards/rejected": -6.683673858642578, + "logits/chosen": -2.8837997913360596, + "logits/rejected": -2.9029414653778076, + "logps/chosen": -222.11074829101562, + "logps/rejected": -354.0726318359375, + "loss": 0.0353, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.5860627889633179, + "rewards/margins": 10.358034133911133, + "rewards/rejected": -8.771970748901367, "step": 3360 }, { "epoch": 1.74, "learning_rate": 2.3340982979537197e-07, - "logits/chosen": -2.828904390335083, - "logits/rejected": -2.8650832176208496, - "logps/chosen": -292.4428405761719, - "logps/rejected": -395.83441162109375, - "loss": 0.1209, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 2.142470359802246, - "rewards/margins": 8.96367073059082, - "rewards/rejected": -6.821200370788574, + "logits/chosen": -2.863508462905884, + "logits/rejected": -2.8701565265655518, + "logps/chosen": -295.1399841308594, + "logps/rejected": -336.8844909667969, + "loss": 0.0599, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.870693564414978, + "rewards/margins": 11.696889877319336, + "rewards/rejected": -9.826196670532227, "step": 3370 }, { "epoch": 1.74, "learning_rate": 2.3245362401988909e-07, - "logits/chosen": -2.8313663005828857, - "logits/rejected": -2.8785340785980225, - "logps/chosen": -261.6875, - "logps/rejected": -338.6850280761719, - "loss": 0.1164, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.722820520401001, - "rewards/margins": 8.832094192504883, - "rewards/rejected": -7.109274387359619, + "logits/chosen": -2.865304470062256, + "logits/rejected": -2.8876614570617676, + "logps/chosen": -257.00946044921875, + "logps/rejected": -310.2362365722656, + "loss": 0.049, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.187718629837036, + "rewards/margins": 12.276281356811523, + "rewards/rejected": -10.088563919067383, "step": 3380 }, { "epoch": 1.75, "learning_rate": 2.314974182444062e-07, - "logits/chosen": -2.8746933937072754, - "logits/rejected": -2.916006326675415, - "logps/chosen": -223.5010986328125, - "logps/rejected": -337.25439453125, - "loss": 0.0708, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": -0.14647480845451355, - "rewards/margins": 6.7344183921813965, - "rewards/rejected": -6.880892276763916, + "logits/chosen": -2.9192216396331787, + "logits/rejected": -2.9404091835021973, + "logps/chosen": -217.8707733154297, + "logps/rejected": -300.88116455078125, + "loss": 0.0656, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.414391428232193, + "rewards/margins": 9.451571464538574, + "rewards/rejected": -9.037179946899414, "step": 3390 }, { "epoch": 1.76, "learning_rate": 2.305412124689233e-07, - "logits/chosen": -2.765967845916748, - "logits/rejected": -2.7947843074798584, - "logps/chosen": -268.5664978027344, - "logps/rejected": -348.4090576171875, - "loss": 0.1677, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.8971278071403503, - "rewards/margins": 8.047765731811523, - "rewards/rejected": -7.150639533996582, + "logits/chosen": -2.827575206756592, + "logits/rejected": -2.8498644828796387, + "logps/chosen": -267.7716979980469, + "logps/rejected": -315.53314208984375, + "loss": 0.038, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.226161003112793, + "rewards/margins": 11.051640510559082, + "rewards/rejected": -9.825479507446289, "step": 3400 }, { "epoch": 1.76, - "eval_logits/chosen": -2.780897855758667, - "eval_logits/rejected": -2.84079647064209, - "eval_logps/chosen": -255.14529418945312, - "eval_logps/rejected": -327.32879638671875, - "eval_loss": 0.49304431676864624, - "eval_rewards/accuracies": 0.8199999928474426, - "eval_rewards/chosen": -0.7465150356292725, - "eval_rewards/margins": 4.372622489929199, - "eval_rewards/rejected": -5.119137763977051, - "eval_runtime": 278.8693, - "eval_samples_per_second": 7.172, - "eval_steps_per_second": 0.448, + "eval_logits/chosen": -2.8585731983184814, + "eval_logits/rejected": -2.8983540534973145, + "eval_logps/chosen": -249.17681884765625, + "eval_logps/rejected": -303.0600280761719, + "eval_loss": 0.36029350757598877, + "eval_rewards/accuracies": 0.8619999885559082, + "eval_rewards/chosen": -0.13623149693012238, + "eval_rewards/margins": 7.59638786315918, + "eval_rewards/rejected": -7.732619285583496, + "eval_runtime": 276.3142, + "eval_samples_per_second": 7.238, + "eval_steps_per_second": 0.452, "step": 3400 }, { "epoch": 1.76, "learning_rate": 2.295850066934404e-07, - "logits/chosen": -2.793631076812744, - "logits/rejected": -2.8191399574279785, - "logps/chosen": -238.7302703857422, - "logps/rejected": -339.9305419921875, - "loss": 0.0874, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.6573998332023621, - "rewards/margins": 7.640374660491943, - "rewards/rejected": -6.982974052429199, + "logits/chosen": -2.854020833969116, + "logits/rejected": -2.8471643924713135, + "logps/chosen": -239.21630859375, + "logps/rejected": -291.77484130859375, + "loss": 0.0585, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.6085206866264343, + "rewards/margins": 9.598236083984375, + "rewards/rejected": -8.989715576171875, "step": 3410 }, { "epoch": 1.77, "learning_rate": 2.2862880091795752e-07, - "logits/chosen": -2.8327536582946777, - "logits/rejected": -2.8614022731781006, - "logps/chosen": -239.38424682617188, - "logps/rejected": -378.095458984375, - "loss": 0.0628, + "logits/chosen": -2.8657078742980957, + "logits/rejected": -2.8849129676818848, + "logps/chosen": -234.9466552734375, + "logps/rejected": -326.8103332519531, + "loss": 0.0441, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.9374256134033203, - "rewards/margins": 9.278172492980957, - "rewards/rejected": -8.340746879577637, + "rewards/chosen": 1.371222734451294, + "rewards/margins": 9.867246627807617, + "rewards/rejected": -8.496023178100586, "step": 3420 }, { "epoch": 1.77, "learning_rate": 2.2767259514247464e-07, - "logits/chosen": -2.7702488899230957, - "logits/rejected": -2.834359645843506, - "logps/chosen": -219.39254760742188, - "logps/rejected": -384.39666748046875, - "loss": 0.0825, - "rewards/accuracies": 0.9375, - "rewards/chosen": 1.6664879322052002, - "rewards/margins": 8.466286659240723, - "rewards/rejected": -6.799798011779785, + "logits/chosen": -2.801828145980835, + "logits/rejected": -2.854135513305664, + "logps/chosen": -215.4505615234375, + "logps/rejected": -381.11016845703125, + "loss": 0.0432, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.9942662715911865, + "rewards/margins": 11.784372329711914, + "rewards/rejected": -9.790106773376465, "step": 3430 }, { "epoch": 1.78, "learning_rate": 2.2671638936699178e-07, - "logits/chosen": -2.7215263843536377, - "logits/rejected": -2.743380069732666, - "logps/chosen": -258.17010498046875, - "logps/rejected": -365.0523986816406, - "loss": 0.0634, + "logits/chosen": -2.7449843883514404, + "logits/rejected": -2.7773513793945312, + "logps/chosen": -251.1278839111328, + "logps/rejected": -324.54095458984375, + "loss": 0.0538, "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.524726927280426, - "rewards/margins": 7.3098273277282715, - "rewards/rejected": -6.785101413726807, + "rewards/chosen": 1.2289482355117798, + "rewards/margins": 10.099966049194336, + "rewards/rejected": -8.87101936340332, "step": 3440 }, { "epoch": 1.78, "learning_rate": 2.257601835915089e-07, - "logits/chosen": -2.795828104019165, - "logits/rejected": -2.8122973442077637, - "logps/chosen": -330.8063049316406, - "logps/rejected": -366.1406555175781, - "loss": 0.0586, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.7863120436668396, - "rewards/margins": 7.602275848388672, - "rewards/rejected": -6.815962791442871, + "logits/chosen": -2.7815449237823486, + "logits/rejected": -2.7834222316741943, + "logps/chosen": -331.26763916015625, + "logps/rejected": -343.36260986328125, + "loss": 0.0218, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9003337025642395, + "rewards/margins": 10.755301475524902, + "rewards/rejected": -9.854968070983887, "step": 3450 }, { "epoch": 1.79, "learning_rate": 2.24803977816026e-07, - "logits/chosen": -2.7401373386383057, - "logits/rejected": -2.792062997817993, - "logps/chosen": -262.8241271972656, - "logps/rejected": -324.1911926269531, - "loss": 0.0767, + "logits/chosen": -2.694594383239746, + "logits/rejected": -2.7150683403015137, + "logps/chosen": -267.81951904296875, + "logps/rejected": -286.0038757324219, + "loss": 0.0525, "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.5823167562484741, - "rewards/margins": 8.485211372375488, - "rewards/rejected": -6.902894496917725, + "rewards/chosen": 1.082779049873352, + "rewards/margins": 10.620426177978516, + "rewards/rejected": -9.537646293640137, "step": 3460 }, { "epoch": 1.79, "learning_rate": 2.2384777204054313e-07, - "logits/chosen": -2.840707778930664, - "logits/rejected": -2.8512017726898193, - "logps/chosen": -259.43536376953125, - "logps/rejected": -344.71881103515625, - "loss": 0.0943, + "logits/chosen": -2.798046588897705, + "logits/rejected": -2.805220127105713, + "logps/chosen": -259.84417724609375, + "logps/rejected": -313.9005432128906, + "loss": 0.0848, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.8209340572357178, - "rewards/margins": 8.689406394958496, - "rewards/rejected": -6.868472099304199, + "rewards/chosen": 1.7800538539886475, + "rewards/margins": 11.32535457611084, + "rewards/rejected": -9.54530143737793, "step": 3470 }, { "epoch": 1.8, "learning_rate": 2.2289156626506022e-07, - "logits/chosen": -2.7405035495758057, - "logits/rejected": -2.7608306407928467, - "logps/chosen": -306.3294982910156, - "logps/rejected": -349.9527893066406, - "loss": 0.0995, + "logits/chosen": -2.662357807159424, + "logits/rejected": -2.650670051574707, + "logps/chosen": -306.3658142089844, + "logps/rejected": -316.5223083496094, + "loss": 0.0445, "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.6940513849258423, - "rewards/margins": 8.543526649475098, - "rewards/rejected": -7.849474906921387, + "rewards/chosen": 0.6904212236404419, + "rewards/margins": 10.881820678710938, + "rewards/rejected": -10.191400527954102, "step": 3480 }, { "epoch": 1.8, "learning_rate": 2.2193536048957733e-07, - "logits/chosen": -2.891343593597412, - "logits/rejected": -2.896864414215088, - "logps/chosen": -275.199462890625, - "logps/rejected": -365.2381591796875, - "loss": 0.1152, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.8797968029975891, - "rewards/margins": 7.490570068359375, - "rewards/rejected": -6.610772609710693, + "logits/chosen": -2.8342974185943604, + "logits/rejected": -2.812854290008545, + "logps/chosen": -269.62890625, + "logps/rejected": -320.508544921875, + "loss": 0.0497, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.436875581741333, + "rewards/margins": 9.98438835144043, + "rewards/rejected": -8.54751205444336, "step": 3490 }, { "epoch": 1.81, "learning_rate": 2.2097915471409445e-07, - "logits/chosen": -2.8077056407928467, - "logits/rejected": -2.8688576221466064, - "logps/chosen": -241.04013061523438, - "logps/rejected": -348.31292724609375, - "loss": 0.0581, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.4186904430389404, - "rewards/margins": 8.326966285705566, - "rewards/rejected": -6.908276557922363, + "logits/chosen": -2.7414932250976562, + "logits/rejected": -2.794750213623047, + "logps/chosen": -235.2359619140625, + "logps/rejected": -336.6121826171875, + "loss": 0.0369, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.999106764793396, + "rewards/margins": 11.338249206542969, + "rewards/rejected": -9.339143753051758, "step": 3500 }, { "epoch": 1.81, - "eval_logits/chosen": -2.8191068172454834, - "eval_logits/rejected": -2.874876022338867, - "eval_logps/chosen": -250.5966339111328, - "eval_logps/rejected": -321.31298828125, - "eval_loss": 0.48586151003837585, - "eval_rewards/accuracies": 0.8180000185966492, - "eval_rewards/chosen": -0.29164865612983704, - "eval_rewards/margins": 4.225912094116211, - "eval_rewards/rejected": -4.5175604820251465, - "eval_runtime": 278.7288, - "eval_samples_per_second": 7.175, - "eval_steps_per_second": 0.448, + "eval_logits/chosen": -2.739154100418091, + "eval_logits/rejected": -2.770928382873535, + "eval_logps/chosen": -246.2957305908203, + "eval_logps/rejected": -297.77716064453125, + "eval_loss": 0.337977796792984, + "eval_rewards/accuracies": 0.8640000224113464, + "eval_rewards/chosen": 0.1518779695034027, + "eval_rewards/margins": 7.356210708618164, + "eval_rewards/rejected": -7.204331874847412, + "eval_runtime": 276.5473, + "eval_samples_per_second": 7.232, + "eval_steps_per_second": 0.452, "step": 3500 }, { "epoch": 1.81, "learning_rate": 2.200229489386116e-07, - "logits/chosen": -2.8601200580596924, - "logits/rejected": -2.905097484588623, - "logps/chosen": -241.131103515625, - "logps/rejected": -370.8861999511719, - "loss": 0.0544, + "logits/chosen": -2.7742838859558105, + "logits/rejected": -2.7988200187683105, + "logps/chosen": -242.84951782226562, + "logps/rejected": -335.7135925292969, + "loss": 0.0494, "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.0310213565826416, - "rewards/margins": 8.371079444885254, - "rewards/rejected": -7.340056419372559, + "rewards/chosen": 1.1923649311065674, + "rewards/margins": 10.027400970458984, + "rewards/rejected": -8.835037231445312, "step": 3510 }, { "epoch": 1.82, "learning_rate": 2.190667431631287e-07, - "logits/chosen": -2.8398001194000244, - "logits/rejected": -2.8893535137176514, - "logps/chosen": -271.63861083984375, - "logps/rejected": -412.21075439453125, - "loss": 0.0874, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.676041841506958, - "rewards/margins": 9.154836654663086, - "rewards/rejected": -7.478795051574707, + "logits/chosen": -2.7628655433654785, + "logits/rejected": -2.7690541744232178, + "logps/chosen": -272.36102294921875, + "logps/rejected": -352.4149475097656, + "loss": 0.099, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.6473907232284546, + "rewards/margins": 11.878129959106445, + "rewards/rejected": -10.230737686157227, "step": 3520 }, { "epoch": 1.82, "learning_rate": 2.1811053738764582e-07, - "logits/chosen": -2.7711901664733887, - "logits/rejected": -2.8552980422973633, - "logps/chosen": -193.1044921875, - "logps/rejected": -350.78619384765625, - "loss": 0.0921, + "logits/chosen": -2.6771597862243652, + "logits/rejected": -2.7449512481689453, + "logps/chosen": -190.2683563232422, + "logps/rejected": -329.6096496582031, + "loss": 0.0735, "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.3733131885528564, - "rewards/margins": 8.322312355041504, - "rewards/rejected": -6.948998928070068, + "rewards/chosen": 1.656937599182129, + "rewards/margins": 11.569732666015625, + "rewards/rejected": -9.912796020507812, "step": 3530 }, { "epoch": 1.83, "learning_rate": 2.1715433161216294e-07, - "logits/chosen": -2.8452248573303223, - "logits/rejected": -2.8975632190704346, - "logps/chosen": -254.40078735351562, - "logps/rejected": -356.9154052734375, - "loss": 0.4464, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.0389509201049805, - "rewards/margins": 7.992854118347168, - "rewards/rejected": -6.9539031982421875, + "logits/chosen": -2.73295521736145, + "logits/rejected": -2.738770008087158, + "logps/chosen": -256.7109680175781, + "logps/rejected": -278.31109619140625, + "loss": 0.0529, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.8066117167472839, + "rewards/margins": 11.241575241088867, + "rewards/rejected": -10.434962272644043, "step": 3540 }, { "epoch": 1.83, "learning_rate": 2.1619812583668005e-07, - "logits/chosen": -2.9288828372955322, - "logits/rejected": -2.958012104034424, - "logps/chosen": -227.0967559814453, - "logps/rejected": -314.05908203125, - "loss": 0.0829, + "logits/chosen": -2.818835735321045, + "logits/rejected": -2.8273138999938965, + "logps/chosen": -220.4379119873047, + "logps/rejected": -290.4381103515625, + "loss": 0.065, "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.8639600872993469, - "rewards/margins": 8.197954177856445, - "rewards/rejected": -7.3339948654174805, + "rewards/chosen": 1.5298454761505127, + "rewards/margins": 11.439474105834961, + "rewards/rejected": -9.909627914428711, "step": 3550 }, { "epoch": 1.84, "learning_rate": 2.1524192006119714e-07, - "logits/chosen": -2.892519235610962, - "logits/rejected": -2.915253162384033, - "logps/chosen": -230.73831176757812, - "logps/rejected": -333.3849792480469, - "loss": 0.099, + "logits/chosen": -2.8162848949432373, + "logits/rejected": -2.816516399383545, + "logps/chosen": -225.169677734375, + "logps/rejected": -296.16351318359375, + "loss": 0.0727, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.6762139201164246, - "rewards/margins": 7.587038516998291, - "rewards/rejected": -6.910824775695801, + "rewards/chosen": 1.2321860790252686, + "rewards/margins": 10.252374649047852, + "rewards/rejected": -9.02018928527832, "step": 3560 }, { "epoch": 1.84, "learning_rate": 2.1428571428571426e-07, - "logits/chosen": -2.9029574394226074, - "logits/rejected": -2.9076485633850098, - "logps/chosen": -245.4383544921875, - "logps/rejected": -291.6465148925781, - "loss": 0.0601, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.6226890087127686, - "rewards/margins": 7.25844669342041, - "rewards/rejected": -6.6357574462890625, + "logits/chosen": -2.841700792312622, + "logits/rejected": -2.8049709796905518, + "logps/chosen": -241.82815551757812, + "logps/rejected": -252.5695037841797, + "loss": 0.0396, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.0010608434677124, + "rewards/margins": 9.947206497192383, + "rewards/rejected": -8.946146011352539, "step": 3570 }, { "epoch": 1.85, "learning_rate": 2.133295085102314e-07, - "logits/chosen": -2.7956185340881348, - "logits/rejected": -2.822728157043457, - "logps/chosen": -259.4379577636719, - "logps/rejected": -321.2564392089844, - "loss": 0.069, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.7765030860900879, - "rewards/margins": 7.667203426361084, - "rewards/rejected": -6.890700340270996, + "logits/chosen": -2.7332897186279297, + "logits/rejected": -2.7583565711975098, + "logps/chosen": -257.93804931640625, + "logps/rejected": -295.90740966796875, + "loss": 0.0264, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.9264896512031555, + "rewards/margins": 10.437051773071289, + "rewards/rejected": -9.510560989379883, "step": 3580 }, { "epoch": 1.85, "learning_rate": 2.1237330273474851e-07, - "logits/chosen": -2.8928589820861816, - "logits/rejected": -2.904470920562744, - "logps/chosen": -287.71087646484375, - "logps/rejected": -337.8426818847656, - "loss": 0.0637, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.0680280923843384, - "rewards/margins": 7.8063483238220215, - "rewards/rejected": -6.738319396972656, + "logits/chosen": -2.876116991043091, + "logits/rejected": -2.8713741302490234, + "logps/chosen": -284.0206298828125, + "logps/rejected": -300.7164306640625, + "loss": 0.0623, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.4366271495819092, + "rewards/margins": 9.828795433044434, + "rewards/rejected": -8.392168045043945, "step": 3590 }, { "epoch": 1.86, "learning_rate": 2.1141709695926563e-07, - "logits/chosen": -2.8569111824035645, - "logits/rejected": -2.9227359294891357, - "logps/chosen": -224.90072631835938, - "logps/rejected": -317.1742858886719, - "loss": 0.053, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.06578528881073, - "rewards/margins": 8.363664627075195, - "rewards/rejected": -7.297879695892334, + "logits/chosen": -2.838569402694702, + "logits/rejected": -2.874798059463501, + "logps/chosen": -224.54226684570312, + "logps/rejected": -287.66119384765625, + "loss": 0.05, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.1016333103179932, + "rewards/margins": 10.594505310058594, + "rewards/rejected": -9.49287223815918, "step": 3600 }, { "epoch": 1.86, - "eval_logits/chosen": -2.830040216445923, - "eval_logits/rejected": -2.888530731201172, - "eval_logps/chosen": -253.77218627929688, - "eval_logps/rejected": -326.65185546875, - "eval_loss": 0.4978037178516388, - "eval_rewards/accuracies": 0.8220000267028809, - "eval_rewards/chosen": -0.6092034578323364, - "eval_rewards/margins": 4.442237377166748, - "eval_rewards/rejected": -5.051440238952637, - "eval_runtime": 278.5418, - "eval_samples_per_second": 7.18, - "eval_steps_per_second": 0.449, + "eval_logits/chosen": -2.831789016723633, + "eval_logits/rejected": -2.867061138153076, + "eval_logps/chosen": -247.27340698242188, + "eval_logps/rejected": -299.8389892578125, + "eval_loss": 0.3444737493991852, + "eval_rewards/accuracies": 0.8659999966621399, + "eval_rewards/chosen": 0.05410884693264961, + "eval_rewards/margins": 7.464624881744385, + "eval_rewards/rejected": -7.410516262054443, + "eval_runtime": 276.5314, + "eval_samples_per_second": 7.232, + "eval_steps_per_second": 0.452, "step": 3600 }, { "epoch": 1.86, "learning_rate": 2.1046089118378275e-07, - "logits/chosen": -2.866051197052002, - "logits/rejected": -2.8638253211975098, - "logps/chosen": -257.161865234375, - "logps/rejected": -354.1871643066406, - "loss": 0.1621, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.5690984725952148, - "rewards/margins": 8.197249412536621, - "rewards/rejected": -7.628150939941406, + "logits/chosen": -2.8812015056610107, + "logits/rejected": -2.8616695404052734, + "logps/chosen": -262.34967041015625, + "logps/rejected": -331.433349609375, + "loss": 0.0796, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.7427822351455688, + "rewards/margins": 10.863062858581543, + "rewards/rejected": -10.120279312133789, "step": 3610 }, { "epoch": 1.87, "learning_rate": 2.0950468540829986e-07, - "logits/chosen": -2.8620736598968506, - "logits/rejected": -2.8946380615234375, - "logps/chosen": -248.4556121826172, - "logps/rejected": -326.19573974609375, - "loss": 0.109, + "logits/chosen": -2.862776041030884, + "logits/rejected": -2.9192395210266113, + "logps/chosen": -245.2966766357422, + "logps/rejected": -301.38946533203125, + "loss": 0.0715, "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.479146957397461, - "rewards/margins": 7.966286659240723, - "rewards/rejected": -6.4871392250061035, + "rewards/chosen": 1.7950388193130493, + "rewards/margins": 11.199941635131836, + "rewards/rejected": -9.404902458190918, "step": 3620 }, { "epoch": 1.87, "learning_rate": 2.0854847963281698e-07, - "logits/chosen": -2.8275606632232666, - "logits/rejected": -2.8644332885742188, - "logps/chosen": -289.37261962890625, - "logps/rejected": -347.7831726074219, - "loss": 0.1032, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.3235790729522705, - "rewards/margins": 7.77987003326416, - "rewards/rejected": -6.456290245056152, + "logits/chosen": -2.826916456222534, + "logits/rejected": -2.821584939956665, + "logps/chosen": -286.9228820800781, + "logps/rejected": -304.72406005859375, + "loss": 0.0566, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.570765733718872, + "rewards/margins": 9.152833938598633, + "rewards/rejected": -7.582067966461182, "step": 3630 }, { "epoch": 1.88, "learning_rate": 2.0759227385733407e-07, - "logits/chosen": -2.7326953411102295, - "logits/rejected": -2.7684574127197266, - "logps/chosen": -280.19427490234375, - "logps/rejected": -348.9992370605469, - "loss": 0.0828, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.8279417157173157, - "rewards/margins": 7.6720428466796875, - "rewards/rejected": -6.844099998474121, + "logits/chosen": -2.7103796005249023, + "logits/rejected": -2.7242186069488525, + "logps/chosen": -284.1620788574219, + "logps/rejected": -318.30224609375, + "loss": 0.0917, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.43115752935409546, + "rewards/margins": 9.300579071044922, + "rewards/rejected": -8.86942195892334, "step": 3640 }, { "epoch": 1.88, "learning_rate": 2.066360680818512e-07, - "logits/chosen": -2.8425121307373047, - "logits/rejected": -2.85652494430542, - "logps/chosen": -328.8985595703125, - "logps/rejected": -353.4617004394531, - "loss": 0.1833, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.654839277267456, - "rewards/margins": 8.085906028747559, - "rewards/rejected": -6.43106746673584, + "logits/chosen": -2.8609883785247803, + "logits/rejected": -2.8407845497131348, + "logps/chosen": -330.72491455078125, + "logps/rejected": -300.6785888671875, + "loss": 0.0496, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.4722001552581787, + "rewards/margins": 9.851592063903809, + "rewards/rejected": -8.379392623901367, "step": 3650 }, { "epoch": 1.89, "learning_rate": 2.0567986230636832e-07, - "logits/chosen": -2.82452392578125, - "logits/rejected": -2.8541016578674316, - "logps/chosen": -229.6313018798828, - "logps/rejected": -334.2443542480469, - "loss": 0.0618, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.3441283702850342, - "rewards/margins": 8.709617614746094, - "rewards/rejected": -7.365488529205322, + "logits/chosen": -2.8376283645629883, + "logits/rejected": -2.8254470825195312, + "logps/chosen": -227.9430389404297, + "logps/rejected": -277.533935546875, + "loss": 0.0293, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.5125881433486938, + "rewards/margins": 11.706796646118164, + "rewards/rejected": -10.194208145141602, "step": 3660 }, { "epoch": 1.89, "learning_rate": 2.0472365653088544e-07, - "logits/chosen": -2.8438940048217773, - "logits/rejected": -2.852888345718384, - "logps/chosen": -275.33172607421875, - "logps/rejected": -345.549560546875, - "loss": 0.1453, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.8664258718490601, - "rewards/margins": 7.481070041656494, - "rewards/rejected": -6.6146440505981445, + "logits/chosen": -2.8402044773101807, + "logits/rejected": -2.8416686058044434, + "logps/chosen": -271.40899658203125, + "logps/rejected": -313.9754638671875, + "loss": 0.0849, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.2610431909561157, + "rewards/margins": 9.34048080444336, + "rewards/rejected": -8.079437255859375, "step": 3670 }, { "epoch": 1.9, "learning_rate": 2.0376745075540256e-07, - "logits/chosen": -2.86684513092041, - "logits/rejected": -2.9170501232147217, - "logps/chosen": -347.53875732421875, - "logps/rejected": -356.6395568847656, - "loss": 0.085, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.8831812739372253, - "rewards/margins": 8.366857528686523, - "rewards/rejected": -7.483675956726074, + "logits/chosen": -2.8483176231384277, + "logits/rejected": -2.859550952911377, + "logps/chosen": -344.5357971191406, + "logps/rejected": -313.8351135253906, + "loss": 0.0453, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.1983067989349365, + "rewards/margins": 11.868532180786133, + "rewards/rejected": -10.670225143432617, "step": 3680 }, { "epoch": 1.91, "learning_rate": 2.0281124497991967e-07, - "logits/chosen": -2.8049659729003906, - "logits/rejected": -2.8793957233428955, - "logps/chosen": -242.1417999267578, - "logps/rejected": -375.3875427246094, - "loss": 0.0978, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 1.4780528545379639, - "rewards/margins": 8.171385765075684, - "rewards/rejected": -6.693333625793457, + "logits/chosen": -2.787320137023926, + "logits/rejected": -2.847799777984619, + "logps/chosen": -237.95449829101562, + "logps/rejected": -329.31707763671875, + "loss": 0.1436, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8965975046157837, + "rewards/margins": 11.685870170593262, + "rewards/rejected": -9.789273262023926, "step": 3690 }, { "epoch": 1.91, "learning_rate": 2.018550392044368e-07, - "logits/chosen": -2.8545122146606445, - "logits/rejected": -2.8557956218719482, - "logps/chosen": -269.9671936035156, - "logps/rejected": -395.208251953125, - "loss": 0.0603, + "logits/chosen": -2.843174457550049, + "logits/rejected": -2.839344024658203, + "logps/chosen": -264.2933654785156, + "logps/rejected": -351.58770751953125, + "loss": 0.0576, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.7889567613601685, - "rewards/margins": 8.470499038696289, - "rewards/rejected": -7.681540489196777, + "rewards/chosen": 1.3560373783111572, + "rewards/margins": 11.294261932373047, + "rewards/rejected": -9.938224792480469, "step": 3700 }, { "epoch": 1.91, - "eval_logits/chosen": -2.8074629306793213, - "eval_logits/rejected": -2.8709890842437744, - "eval_logps/chosen": -255.2186737060547, - "eval_logps/rejected": -326.8601989746094, - "eval_loss": 0.48304229974746704, - "eval_rewards/accuracies": 0.8059999942779541, - "eval_rewards/chosen": -0.7538514137268066, - "eval_rewards/margins": 4.318424224853516, - "eval_rewards/rejected": -5.072276592254639, - "eval_runtime": 278.8882, - "eval_samples_per_second": 7.171, - "eval_steps_per_second": 0.448, + "eval_logits/chosen": -2.800122022628784, + "eval_logits/rejected": -2.83972430229187, + "eval_logps/chosen": -247.5421600341797, + "eval_logps/rejected": -298.9884948730469, + "eval_loss": 0.34611743688583374, + "eval_rewards/accuracies": 0.871999979019165, + "eval_rewards/chosen": 0.027235815301537514, + "eval_rewards/margins": 7.352701663970947, + "eval_rewards/rejected": -7.325466632843018, + "eval_runtime": 276.4761, + "eval_samples_per_second": 7.234, + "eval_steps_per_second": 0.452, "step": 3700 }, { "epoch": 1.92, "learning_rate": 2.0089883342895388e-07, - "logits/chosen": -2.8379898071289062, - "logits/rejected": -2.8898370265960693, - "logps/chosen": -261.80242919921875, - "logps/rejected": -306.7249755859375, - "loss": 0.07, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.4923166036605835, - "rewards/margins": 7.94171667098999, - "rewards/rejected": -6.449400424957275, + "logits/chosen": -2.8209924697875977, + "logits/rejected": -2.837202548980713, + "logps/chosen": -266.49554443359375, + "logps/rejected": -279.29290771484375, + "loss": 0.0443, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.605319619178772, + "rewards/margins": 10.836023330688477, + "rewards/rejected": -9.23070240020752, "step": 3710 }, { "epoch": 1.92, "learning_rate": 1.9994262765347102e-07, - "logits/chosen": -2.8545596599578857, - "logits/rejected": -2.857635974884033, - "logps/chosen": -302.4580993652344, - "logps/rejected": -325.986083984375, - "loss": 0.0658, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.9443203210830688, - "rewards/margins": 8.127703666687012, - "rewards/rejected": -7.183383941650391, + "logits/chosen": -2.8240294456481934, + "logits/rejected": -2.8355374336242676, + "logps/chosen": -297.0013732910156, + "logps/rejected": -293.91693115234375, + "loss": 0.0839, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.4894896745681763, + "rewards/margins": 11.263047218322754, + "rewards/rejected": -9.773557662963867, "step": 3720 }, { "epoch": 1.93, "learning_rate": 1.9898642187798813e-07, - "logits/chosen": -2.745156764984131, - "logits/rejected": -2.785728931427002, - "logps/chosen": -261.17877197265625, - "logps/rejected": -370.0102844238281, - "loss": 0.0484, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.6731252670288086, - "rewards/margins": 9.124866485595703, - "rewards/rejected": -7.4517412185668945, + "logits/chosen": -2.709597587585449, + "logits/rejected": -2.7396557331085205, + "logps/chosen": -255.9827423095703, + "logps/rejected": -359.24285888671875, + "loss": 0.0397, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.191983938217163, + "rewards/margins": 11.850685119628906, + "rewards/rejected": -9.658700942993164, "step": 3730 }, { "epoch": 1.93, "learning_rate": 1.9803021610250525e-07, - "logits/chosen": -2.8896374702453613, - "logits/rejected": -2.9223759174346924, - "logps/chosen": -214.2520294189453, - "logps/rejected": -315.9022216796875, - "loss": 0.1006, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.5602417588233948, - "rewards/margins": 6.8746466636657715, - "rewards/rejected": -6.314405918121338, + "logits/chosen": -2.8636982440948486, + "logits/rejected": -2.846600294113159, + "logps/chosen": -210.83261108398438, + "logps/rejected": -284.43341064453125, + "loss": 0.0541, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.9021838903427124, + "rewards/margins": 9.569524765014648, + "rewards/rejected": -8.667341232299805, "step": 3740 }, { "epoch": 1.94, "learning_rate": 1.9707401032702237e-07, - "logits/chosen": -2.772069215774536, - "logits/rejected": -2.7865991592407227, - "logps/chosen": -257.6865234375, - "logps/rejected": -336.3182373046875, - "loss": 0.0962, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.9006959795951843, - "rewards/margins": 7.774443626403809, - "rewards/rejected": -6.8737473487854, + "logits/chosen": -2.7177462577819824, + "logits/rejected": -2.708138942718506, + "logps/chosen": -255.06057739257812, + "logps/rejected": -314.787841796875, + "loss": 0.1153, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.1623927354812622, + "rewards/margins": 9.426264762878418, + "rewards/rejected": -8.263872146606445, "step": 3750 }, { "epoch": 1.94, "learning_rate": 1.9611780455153948e-07, - "logits/chosen": -2.8456928730010986, - "logits/rejected": -2.934762477874756, - "logps/chosen": -185.00128173828125, - "logps/rejected": -335.33245849609375, - "loss": 0.0523, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.1279864311218262, - "rewards/margins": 7.620774745941162, - "rewards/rejected": -6.4927873611450195, + "logits/chosen": -2.8263397216796875, + "logits/rejected": -2.8843884468078613, + "logps/chosen": -177.483642578125, + "logps/rejected": -303.78265380859375, + "loss": 0.0322, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.8735061883926392, + "rewards/margins": 10.674055099487305, + "rewards/rejected": -8.800549507141113, "step": 3760 }, { "epoch": 1.95, "learning_rate": 1.951615987760566e-07, - "logits/chosen": -2.7873167991638184, - "logits/rejected": -2.8468079566955566, - "logps/chosen": -234.56436157226562, - "logps/rejected": -342.3611145019531, - "loss": 0.0705, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.4214778542518616, - "rewards/margins": 7.9835524559021, - "rewards/rejected": -7.562074184417725, + "logits/chosen": -2.749332904815674, + "logits/rejected": -2.7594761848449707, + "logps/chosen": -234.3672332763672, + "logps/rejected": -309.1501770019531, + "loss": 0.0313, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7482800483703613, + "rewards/margins": 11.097860336303711, + "rewards/rejected": -10.349580764770508, "step": 3770 }, { "epoch": 1.95, "learning_rate": 1.942053930005737e-07, - "logits/chosen": -2.926084041595459, - "logits/rejected": -2.9593288898468018, - "logps/chosen": -222.12442016601562, - "logps/rejected": -343.7054748535156, - "loss": 0.0495, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.9796302318572998, - "rewards/margins": 9.76245403289795, - "rewards/rejected": -7.7828240394592285, + "logits/chosen": -2.913954257965088, + "logits/rejected": -2.9320945739746094, + "logps/chosen": -221.63943481445312, + "logps/rejected": -337.4884338378906, + "loss": 0.0418, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 2.0281310081481934, + "rewards/margins": 11.20705795288086, + "rewards/rejected": -9.178927421569824, "step": 3780 }, { "epoch": 1.96, "learning_rate": 1.9324918722509086e-07, - "logits/chosen": -2.8268990516662598, - "logits/rejected": -2.892279624938965, - "logps/chosen": -268.910888671875, - "logps/rejected": -384.81329345703125, - "loss": 0.1037, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.7527690529823303, - "rewards/margins": 8.045644760131836, - "rewards/rejected": -7.29287576675415, + "logits/chosen": -2.7262675762176514, + "logits/rejected": -2.7555019855499268, + "logps/chosen": -263.38702392578125, + "logps/rejected": -355.05194091796875, + "loss": 0.0866, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.3051530122756958, + "rewards/margins": 10.727740287780762, + "rewards/rejected": -9.422586441040039, "step": 3790 }, { "epoch": 1.96, "learning_rate": 1.9229298144960794e-07, - "logits/chosen": -2.876244068145752, - "logits/rejected": -2.9497416019439697, - "logps/chosen": -252.00454711914062, - "logps/rejected": -334.44183349609375, - "loss": 0.1269, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.8252018094062805, - "rewards/margins": 7.863356590270996, - "rewards/rejected": -7.038155555725098, + "logits/chosen": -2.756274938583374, + "logits/rejected": -2.8092291355133057, + "logps/chosen": -248.3235321044922, + "logps/rejected": -302.1200256347656, + "loss": 0.0632, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.191517949104309, + "rewards/margins": 10.93460750579834, + "rewards/rejected": -9.74308967590332, "step": 3800 }, { "epoch": 1.96, - "eval_logits/chosen": -2.8554041385650635, - "eval_logits/rejected": -2.912135601043701, - "eval_logps/chosen": -252.01141357421875, - "eval_logps/rejected": -321.33148193359375, - "eval_loss": 0.4793297350406647, - "eval_rewards/accuracies": 0.8159999847412109, - "eval_rewards/chosen": -0.43312713503837585, - "eval_rewards/margins": 4.086278438568115, - "eval_rewards/rejected": -4.519405364990234, - "eval_runtime": 278.7658, - "eval_samples_per_second": 7.174, - "eval_steps_per_second": 0.448, + "eval_logits/chosen": -2.732269287109375, + "eval_logits/rejected": -2.7688071727752686, + "eval_logps/chosen": -249.07334899902344, + "eval_logps/rejected": -301.69573974609375, + "eval_loss": 0.3487098217010498, + "eval_rewards/accuracies": 0.8659999966621399, + "eval_rewards/chosen": -0.12588489055633545, + "eval_rewards/margins": 7.470304012298584, + "eval_rewards/rejected": -7.596189022064209, + "eval_runtime": 276.2019, + "eval_samples_per_second": 7.241, + "eval_steps_per_second": 0.453, "step": 3800 }, { "epoch": 1.97, "learning_rate": 1.9133677567412506e-07, - "logits/chosen": -2.876587152481079, - "logits/rejected": -2.9200339317321777, - "logps/chosen": -290.6847839355469, - "logps/rejected": -328.954833984375, - "loss": 0.0773, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.8511034250259399, - "rewards/margins": 7.0640459060668945, - "rewards/rejected": -6.212942123413086, + "logits/chosen": -2.7639856338500977, + "logits/rejected": -2.7802748680114746, + "logps/chosen": -287.05621337890625, + "logps/rejected": -302.2830505371094, + "loss": 0.0652, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.2139687538146973, + "rewards/margins": 11.25764274597168, + "rewards/rejected": -10.043672561645508, "step": 3810 }, { "epoch": 1.97, "learning_rate": 1.9038056989864218e-07, - "logits/chosen": -2.860901355743408, - "logits/rejected": -2.9103498458862305, - "logps/chosen": -194.5924072265625, - "logps/rejected": -308.28668212890625, - "loss": 0.0948, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 1.2943288087844849, - "rewards/margins": 6.222657203674316, - "rewards/rejected": -4.928328514099121, + "logits/chosen": -2.7388968467712402, + "logits/rejected": -2.7806904315948486, + "logps/chosen": -197.17132568359375, + "logps/rejected": -304.80859375, + "loss": 0.051, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.0389102697372437, + "rewards/margins": 8.792823791503906, + "rewards/rejected": -7.753912925720215, "step": 3820 }, { "epoch": 1.98, "learning_rate": 1.894243641231593e-07, - "logits/chosen": -2.7092089653015137, - "logits/rejected": -2.7530195713043213, - "logps/chosen": -235.4855194091797, - "logps/rejected": -291.153564453125, - "loss": 0.1392, + "logits/chosen": -2.5467875003814697, + "logits/rejected": -2.580683708190918, + "logps/chosen": -243.20046997070312, + "logps/rejected": -269.69976806640625, + "loss": 0.0869, "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.6584703922271729, - "rewards/margins": 6.632231712341309, - "rewards/rejected": -5.973761081695557, + "rewards/chosen": -0.11365087330341339, + "rewards/margins": 9.541735649108887, + "rewards/rejected": -9.655385971069336, "step": 3830 }, { "epoch": 1.98, "learning_rate": 1.884681583476764e-07, - "logits/chosen": -2.8745875358581543, - "logits/rejected": -2.9260880947113037, - "logps/chosen": -281.93890380859375, - "logps/rejected": -325.87213134765625, - "loss": 0.0558, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.5928620100021362, - "rewards/margins": 6.597723484039307, - "rewards/rejected": -6.004860877990723, + "logits/chosen": -2.739814043045044, + "logits/rejected": -2.7556099891662598, + "logps/chosen": -286.0456237792969, + "logps/rejected": -303.90777587890625, + "loss": 0.0742, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.6231164932250977, + "rewards/margins": 10.27683162689209, + "rewards/rejected": -9.653716087341309, "step": 3840 }, { "epoch": 1.99, "learning_rate": 1.8751195257219352e-07, - "logits/chosen": -2.8994739055633545, - "logits/rejected": -2.9488778114318848, - "logps/chosen": -188.28561401367188, - "logps/rejected": -319.95281982421875, - "loss": 0.0772, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.0726709365844727, - "rewards/margins": 6.868597984313965, - "rewards/rejected": -5.79592752456665, + "logits/chosen": -2.7949862480163574, + "logits/rejected": -2.824547052383423, + "logps/chosen": -187.23226928710938, + "logps/rejected": -311.6201171875, + "loss": 0.0471, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.1780050992965698, + "rewards/margins": 10.701109886169434, + "rewards/rejected": -9.523103713989258, "step": 3850 }, { "epoch": 1.99, "learning_rate": 1.8655574679671067e-07, - "logits/chosen": -2.879817485809326, - "logits/rejected": -2.9309778213500977, - "logps/chosen": -252.77963256835938, - "logps/rejected": -353.0110168457031, - "loss": 0.1026, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.7972162961959839, - "rewards/margins": 7.9078192710876465, - "rewards/rejected": -7.110602378845215, + "logits/chosen": -2.7550477981567383, + "logits/rejected": -2.7934212684631348, + "logps/chosen": -251.92672729492188, + "logps/rejected": -326.62957763671875, + "loss": 0.0817, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.8810757398605347, + "rewards/margins": 10.286468505859375, + "rewards/rejected": -9.40539264678955, "step": 3860 }, { "epoch": 2.0, "learning_rate": 1.8559954102122778e-07, - "logits/chosen": -2.9028608798980713, - "logits/rejected": -2.8901915550231934, - "logps/chosen": -253.7400665283203, - "logps/rejected": -343.6603698730469, - "loss": 0.0992, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 1.4057618379592896, - "rewards/margins": 7.818563938140869, - "rewards/rejected": -6.412802696228027, + "logits/chosen": -2.7722268104553223, + "logits/rejected": -2.7459523677825928, + "logps/chosen": -260.33624267578125, + "logps/rejected": -287.18121337890625, + "loss": 0.0733, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.7461458444595337, + "rewards/margins": 10.466187477111816, + "rewards/rejected": -9.720041275024414, "step": 3870 }, { "epoch": 2.0, "learning_rate": 1.8464333524574487e-07, - "logits/chosen": -2.8687682151794434, - "logits/rejected": -2.9383254051208496, - "logps/chosen": -229.3580322265625, - "logps/rejected": -317.1125183105469, - "loss": 0.0279, + "logits/chosen": -2.744771957397461, + "logits/rejected": -2.779881715774536, + "logps/chosen": -231.6518096923828, + "logps/rejected": -295.96417236328125, + "loss": 0.0334, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.3208388090133667, - "rewards/margins": 8.077336311340332, - "rewards/rejected": -6.756496429443359, + "rewards/chosen": 1.0984671115875244, + "rewards/margins": 10.687906265258789, + "rewards/rejected": -9.58944034576416, "step": 3880 }, { "epoch": 2.01, "learning_rate": 1.8368712947026199e-07, - "logits/chosen": -2.865574359893799, - "logits/rejected": -2.918137550354004, - "logps/chosen": -284.77618408203125, - "logps/rejected": -341.1562805175781, - "loss": 0.0144, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 2.1757142543792725, - "rewards/margins": 8.95848274230957, - "rewards/rejected": -6.782768249511719, + "logits/chosen": -2.741731882095337, + "logits/rejected": -2.7852609157562256, + "logps/chosen": -284.8831787109375, + "logps/rejected": -306.7997741699219, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1650390625, + "rewards/margins": 11.534947395324707, + "rewards/rejected": -9.36990737915039, "step": 3890 }, { "epoch": 2.01, "learning_rate": 1.827309236947791e-07, - "logits/chosen": -2.792001247406006, - "logits/rejected": -2.859900712966919, - "logps/chosen": -250.72314453125, - "logps/rejected": -352.9478454589844, - "loss": 0.0191, + "logits/chosen": -2.6648335456848145, + "logits/rejected": -2.702486038208008, + "logps/chosen": -255.36068725585938, + "logps/rejected": -309.9033508300781, + "loss": 0.016, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.8500796556472778, - "rewards/margins": 9.03044605255127, - "rewards/rejected": -8.180366516113281, + "rewards/chosen": 0.7788647413253784, + "rewards/margins": 9.981348991394043, + "rewards/rejected": -9.202483177185059, "step": 3900 }, { "epoch": 2.01, - "eval_logits/chosen": -2.824638843536377, - "eval_logits/rejected": -2.885737657546997, - "eval_logps/chosen": -252.56590270996094, - "eval_logps/rejected": -326.02313232421875, - "eval_loss": 0.48027554154396057, - "eval_rewards/accuracies": 0.8159999847412109, - "eval_rewards/chosen": -0.48857322335243225, - "eval_rewards/margins": 4.499995231628418, - "eval_rewards/rejected": -4.988568305969238, - "eval_runtime": 278.5534, - "eval_samples_per_second": 7.18, - "eval_steps_per_second": 0.449, + "eval_logits/chosen": -2.7075021266937256, + "eval_logits/rejected": -2.7418947219848633, + "eval_logps/chosen": -248.31671142578125, + "eval_logps/rejected": -301.7951965332031, + "eval_loss": 0.3600015938282013, + "eval_rewards/accuracies": 0.871999979019165, + "eval_rewards/chosen": -0.050220977514982224, + "eval_rewards/margins": 7.555914402008057, + "eval_rewards/rejected": -7.606135845184326, + "eval_runtime": 276.337, + "eval_samples_per_second": 7.238, + "eval_steps_per_second": 0.452, "step": 3900 }, { "epoch": 2.02, "learning_rate": 1.8177471791929622e-07, - "logits/chosen": -2.786916971206665, - "logits/rejected": -2.8448758125305176, - "logps/chosen": -242.95535278320312, - "logps/rejected": -407.13848876953125, - "loss": 0.0124, + "logits/chosen": -2.6450483798980713, + "logits/rejected": -2.669512987136841, + "logps/chosen": -239.4267120361328, + "logps/rejected": -356.134033203125, + "loss": 0.008, "rewards/accuracies": 1.0, - "rewards/chosen": 1.6383663415908813, - "rewards/margins": 9.484217643737793, - "rewards/rejected": -7.845850944519043, + "rewards/chosen": 1.9929059743881226, + "rewards/margins": 11.572668075561523, + "rewards/rejected": -9.579761505126953, "step": 3910 }, { "epoch": 2.02, "learning_rate": 1.8081851214381333e-07, - "logits/chosen": -2.750891923904419, - "logits/rejected": -2.7767796516418457, - "logps/chosen": -272.3475646972656, - "logps/rejected": -429.21502685546875, - "loss": 0.0221, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.9751347303390503, - "rewards/margins": 10.68256950378418, - "rewards/rejected": -9.70743465423584, + "logits/chosen": -2.633150100708008, + "logits/rejected": -2.6152095794677734, + "logps/chosen": -269.7049560546875, + "logps/rejected": -419.21173095703125, + "loss": 0.0094, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.237261176109314, + "rewards/margins": 13.594688415527344, + "rewards/rejected": -12.357429504394531, "step": 3920 }, { "epoch": 2.03, "learning_rate": 1.7986230636833047e-07, - "logits/chosen": -2.8120033740997314, - "logits/rejected": -2.8324341773986816, - "logps/chosen": -163.6907196044922, - "logps/rejected": -321.2008972167969, - "loss": 0.0197, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.0225517749786377, - "rewards/margins": 8.076213836669922, - "rewards/rejected": -7.0536627769470215, + "logits/chosen": -2.715364456176758, + "logits/rejected": -2.7181763648986816, + "logps/chosen": -164.25827026367188, + "logps/rejected": -304.0223083496094, + "loss": 0.0266, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.029505729675293, + "rewards/margins": 11.17983627319336, + "rewards/rejected": -10.15032958984375, "step": 3930 }, { "epoch": 2.03, "learning_rate": 1.789061005928476e-07, - "logits/chosen": -2.8280625343322754, - "logits/rejected": -2.853699207305908, - "logps/chosen": -227.91928100585938, - "logps/rejected": -318.5960998535156, - "loss": 0.0209, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.3885648250579834, - "rewards/margins": 9.088371276855469, - "rewards/rejected": -7.699806213378906, + "logits/chosen": -2.7229602336883545, + "logits/rejected": -2.7283406257629395, + "logps/chosen": -228.40771484375, + "logps/rejected": -291.13946533203125, + "loss": 0.0095, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.772528052330017, + "rewards/margins": 11.673210144042969, + "rewards/rejected": -9.900681495666504, "step": 3940 }, { "epoch": 2.04, "learning_rate": 1.7794989481736468e-07, - "logits/chosen": -2.839874267578125, - "logits/rejected": -2.8559255599975586, - "logps/chosen": -265.1712951660156, - "logps/rejected": -363.5159912109375, - "loss": 0.0137, + "logits/chosen": -2.7340428829193115, + "logits/rejected": -2.715100049972534, + "logps/chosen": -266.5819091796875, + "logps/rejected": -340.165283203125, + "loss": 0.0114, "rewards/accuracies": 1.0, - "rewards/chosen": 1.3292443752288818, - "rewards/margins": 9.883804321289062, - "rewards/rejected": -8.554559707641602, + "rewards/chosen": 1.188472032546997, + "rewards/margins": 11.987593650817871, + "rewards/rejected": -10.799120903015137, "step": 3950 }, { "epoch": 2.04, "learning_rate": 1.769936890418818e-07, - "logits/chosen": -2.7737081050872803, - "logits/rejected": -2.808875560760498, - "logps/chosen": -267.7574768066406, - "logps/rejected": -366.5141906738281, - "loss": 0.0159, + "logits/chosen": -2.670009136199951, + "logits/rejected": -2.6639244556427, + "logps/chosen": -272.32891845703125, + "logps/rejected": -329.36920166015625, + "loss": 0.0065, "rewards/accuracies": 1.0, - "rewards/chosen": 1.896043062210083, - "rewards/margins": 9.658696174621582, - "rewards/rejected": -7.762652397155762, + "rewards/chosen": 1.3981056213378906, + "rewards/margins": 11.676907539367676, + "rewards/rejected": -10.278802871704102, "step": 3960 }, { "epoch": 2.05, "learning_rate": 1.760374832663989e-07, - "logits/chosen": -2.759280204772949, - "logits/rejected": -2.753592014312744, - "logps/chosen": -259.1358947753906, - "logps/rejected": -381.5042724609375, - "loss": 0.0097, + "logits/chosen": -2.645658254623413, + "logits/rejected": -2.6252591609954834, + "logps/chosen": -259.00213623046875, + "logps/rejected": -346.74163818359375, + "loss": 0.008, "rewards/accuracies": 1.0, - "rewards/chosen": 0.4964958727359772, - "rewards/margins": 8.723490715026855, - "rewards/rejected": -8.226995468139648, + "rewards/chosen": 0.5098708868026733, + "rewards/margins": 11.310583114624023, + "rewards/rejected": -10.800712585449219, "step": 3970 }, { "epoch": 2.05, "learning_rate": 1.7508127749091603e-07, - "logits/chosen": -2.800121545791626, - "logits/rejected": -2.839855670928955, - "logps/chosen": -279.0640869140625, - "logps/rejected": -373.6163635253906, - "loss": 0.0063, + "logits/chosen": -2.6884918212890625, + "logits/rejected": -2.7006702423095703, + "logps/chosen": -283.17572021484375, + "logps/rejected": -336.9147033691406, + "loss": 0.0614, "rewards/accuracies": 1.0, - "rewards/chosen": 1.0177298784255981, - "rewards/margins": 9.800603866577148, - "rewards/rejected": -8.782873153686523, + "rewards/chosen": 0.606569230556488, + "rewards/margins": 12.021093368530273, + "rewards/rejected": -11.414522171020508, "step": 3980 }, { "epoch": 2.06, "learning_rate": 1.7412507171543314e-07, - "logits/chosen": -2.786940813064575, - "logits/rejected": -2.836458683013916, - "logps/chosen": -194.54067993164062, - "logps/rejected": -314.28802490234375, - "loss": 0.0119, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.3166208267211914, - "rewards/margins": 9.230585098266602, - "rewards/rejected": -8.913965225219727, + "logits/chosen": -2.6824710369110107, + "logits/rejected": -2.6804637908935547, + "logps/chosen": -197.56112670898438, + "logps/rejected": -293.1640319824219, + "loss": 0.0485, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.014252111315727234, + "rewards/margins": 11.204924583435059, + "rewards/rejected": -11.190672874450684, "step": 3990 }, { "epoch": 2.07, "learning_rate": 1.7316886593995028e-07, - "logits/chosen": -2.807687282562256, - "logits/rejected": -2.8598742485046387, - "logps/chosen": -218.19580078125, - "logps/rejected": -344.6866149902344, - "loss": 0.0168, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.6459993720054626, - "rewards/margins": 8.975092887878418, - "rewards/rejected": -8.329092025756836, + "logits/chosen": -2.705667495727539, + "logits/rejected": -2.723625659942627, + "logps/chosen": -218.71951293945312, + "logps/rejected": -320.50372314453125, + "loss": 0.0272, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.5935962796211243, + "rewards/margins": 10.673410415649414, + "rewards/rejected": -10.079813003540039, "step": 4000 }, { "epoch": 2.07, - "eval_logits/chosen": -2.777517795562744, - "eval_logits/rejected": -2.841887950897217, - "eval_logps/chosen": -257.9146423339844, - "eval_logps/rejected": -337.38818359375, - "eval_loss": 0.5259261727333069, - "eval_rewards/accuracies": 0.8059999942779541, - "eval_rewards/chosen": -1.0234500169754028, - "eval_rewards/margins": 5.101625442504883, - "eval_rewards/rejected": -6.1250762939453125, - "eval_runtime": 278.3659, - "eval_samples_per_second": 7.185, - "eval_steps_per_second": 0.449, + "eval_logits/chosen": -2.67600154876709, + "eval_logits/rejected": -2.7034356594085693, + "eval_logps/chosen": -255.6702880859375, + "eval_logps/rejected": -311.6160583496094, + "eval_loss": 0.3653969168663025, + "eval_rewards/accuracies": 0.8700000047683716, + "eval_rewards/chosen": -0.7855775952339172, + "eval_rewards/margins": 7.802643775939941, + "eval_rewards/rejected": -8.588221549987793, + "eval_runtime": 276.2754, + "eval_samples_per_second": 7.239, + "eval_steps_per_second": 0.452, "step": 4000 }, { "epoch": 2.07, "learning_rate": 1.722126601644674e-07, - "logits/chosen": -2.800914764404297, - "logits/rejected": -2.830235719680786, - "logps/chosen": -248.7406768798828, - "logps/rejected": -330.05267333984375, - "loss": 0.0173, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.9889074563980103, - "rewards/margins": 9.021321296691895, - "rewards/rejected": -8.032414436340332, + "logits/chosen": -2.6867549419403076, + "logits/rejected": -2.679992198944092, + "logps/chosen": -249.64657592773438, + "logps/rejected": -299.2137756347656, + "loss": 0.0117, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8983162045478821, + "rewards/margins": 10.576483726501465, + "rewards/rejected": -9.678167343139648, "step": 4010 }, { "epoch": 2.08, "learning_rate": 1.7125645438898452e-07, - "logits/chosen": -2.737515687942505, - "logits/rejected": -2.819586753845215, - "logps/chosen": -243.9829559326172, - "logps/rejected": -350.6963195800781, - "loss": 0.0071, + "logits/chosen": -2.6230149269104004, + "logits/rejected": -2.6642093658447266, + "logps/chosen": -245.2910614013672, + "logps/rejected": -329.4732971191406, + "loss": 0.0054, "rewards/accuracies": 1.0, - "rewards/chosen": 0.6946113705635071, - "rewards/margins": 9.870268821716309, - "rewards/rejected": -9.17565631866455, + "rewards/chosen": 0.5638024210929871, + "rewards/margins": 11.941469192504883, + "rewards/rejected": -11.377667427062988, "step": 4020 }, { "epoch": 2.08, "learning_rate": 1.703002486135016e-07, - "logits/chosen": -2.7942397594451904, - "logits/rejected": -2.8367092609405518, - "logps/chosen": -281.71917724609375, - "logps/rejected": -360.45782470703125, - "loss": 0.0081, + "logits/chosen": -2.684929370880127, + "logits/rejected": -2.6629865169525146, + "logps/chosen": -290.6525573730469, + "logps/rejected": -317.08526611328125, + "loss": 0.0076, "rewards/accuracies": 1.0, - "rewards/chosen": 1.3287776708602905, - "rewards/margins": 10.280738830566406, - "rewards/rejected": -8.951960563659668, + "rewards/chosen": 1.050879716873169, + "rewards/margins": 11.867934226989746, + "rewards/rejected": -10.81705379486084, "step": 4030 }, { "epoch": 2.09, "learning_rate": 1.6934404283801872e-07, - "logits/chosen": -2.7896275520324707, - "logits/rejected": -2.77767014503479, - "logps/chosen": -238.7395477294922, - "logps/rejected": -421.7259826660156, - "loss": 0.0079, + "logits/chosen": -2.6639273166656494, + "logits/rejected": -2.6345295906066895, + "logps/chosen": -241.4361114501953, + "logps/rejected": -379.60772705078125, + "loss": 0.0077, "rewards/accuracies": 1.0, - "rewards/chosen": 0.795906662940979, - "rewards/margins": 10.489312171936035, - "rewards/rejected": -9.693406105041504, + "rewards/chosen": 0.5262530446052551, + "rewards/margins": 11.457598686218262, + "rewards/rejected": -10.931344985961914, "step": 4040 }, { "epoch": 2.09, "learning_rate": 1.6838783706253584e-07, - "logits/chosen": -2.7938098907470703, - "logits/rejected": -2.836862564086914, - "logps/chosen": -192.1158447265625, - "logps/rejected": -331.1219787597656, - "loss": 0.0345, + "logits/chosen": -2.673530101776123, + "logits/rejected": -2.694524049758911, + "logps/chosen": -184.44485473632812, + "logps/rejected": -291.5581970214844, + "loss": 0.0167, "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.6757541298866272, - "rewards/margins": 9.389961242675781, - "rewards/rejected": -8.714208602905273, + "rewards/chosen": 1.4424384832382202, + "rewards/margins": 11.678289413452148, + "rewards/rejected": -10.235852241516113, "step": 4050 }, { "epoch": 2.1, "learning_rate": 1.6743163128705295e-07, - "logits/chosen": -2.7857515811920166, - "logits/rejected": -2.8047680854797363, - "logps/chosen": -237.7971954345703, - "logps/rejected": -362.19476318359375, - "loss": 0.0105, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.536388099193573, - "rewards/margins": 10.030662536621094, - "rewards/rejected": -9.494275093078613, + "logits/chosen": -2.6608340740203857, + "logits/rejected": -2.6483817100524902, + "logps/chosen": -236.818359375, + "logps/rejected": -323.1213073730469, + "loss": 0.0074, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6342740058898926, + "rewards/margins": 13.054033279418945, + "rewards/rejected": -12.419757843017578, "step": 4060 }, { "epoch": 2.1, "learning_rate": 1.664754255115701e-07, - "logits/chosen": -2.777409076690674, - "logits/rejected": -2.831268548965454, - "logps/chosen": -246.1458740234375, - "logps/rejected": -395.8076171875, - "loss": 0.0139, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.3308227062225342, - "rewards/margins": 10.749163627624512, - "rewards/rejected": -10.418339729309082, + "logits/chosen": -2.6585958003997803, + "logits/rejected": -2.6771240234375, + "logps/chosen": -243.3425750732422, + "logps/rejected": -322.475341796875, + "loss": 0.0067, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.611153244972229, + "rewards/margins": 12.21384334564209, + "rewards/rejected": -11.602689743041992, "step": 4070 }, { "epoch": 2.11, "learning_rate": 1.655192197360872e-07, - "logits/chosen": -2.858748197555542, - "logits/rejected": -2.882028102874756, - "logps/chosen": -250.802001953125, - "logps/rejected": -395.7652587890625, - "loss": 0.0194, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.7184736132621765, - "rewards/margins": 10.270122528076172, - "rewards/rejected": -9.55164909362793, + "logits/chosen": -2.7432479858398438, + "logits/rejected": -2.740055561065674, + "logps/chosen": -247.479248046875, + "logps/rejected": -344.71771240234375, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0507471561431885, + "rewards/margins": 12.421725273132324, + "rewards/rejected": -11.370977401733398, "step": 4080 }, { "epoch": 2.11, "learning_rate": 1.6456301396060433e-07, - "logits/chosen": -2.810091018676758, - "logits/rejected": -2.8521618843078613, - "logps/chosen": -316.01104736328125, - "logps/rejected": -424.2666931152344, - "loss": 0.0078, + "logits/chosen": -2.6939754486083984, + "logits/rejected": -2.6896774768829346, + "logps/chosen": -312.4244689941406, + "logps/rejected": -374.0768127441406, + "loss": 0.0068, "rewards/accuracies": 1.0, - "rewards/chosen": 0.11161376535892487, - "rewards/margins": 10.254980087280273, - "rewards/rejected": -10.143366813659668, + "rewards/chosen": 0.8813197016716003, + "rewards/margins": 12.301496505737305, + "rewards/rejected": -11.42017650604248, "step": 4090 }, { "epoch": 2.12, "learning_rate": 1.6360680818512144e-07, - "logits/chosen": -2.7612602710723877, - "logits/rejected": -2.825305461883545, - "logps/chosen": -223.1158905029297, - "logps/rejected": -373.63433837890625, - "loss": 0.0114, + "logits/chosen": -2.641249418258667, + "logits/rejected": -2.680281162261963, + "logps/chosen": -222.09072875976562, + "logps/rejected": -309.78350830078125, + "loss": 0.0062, "rewards/accuracies": 1.0, - "rewards/chosen": 0.576636791229248, - "rewards/margins": 9.575021743774414, - "rewards/rejected": -8.998383522033691, + "rewards/chosen": 0.9023264646530151, + "rewards/margins": 11.617216110229492, + "rewards/rejected": -10.714888572692871, "step": 4100 }, { "epoch": 2.12, - "eval_logits/chosen": -2.7581655979156494, - "eval_logits/rejected": -2.824922800064087, - "eval_logps/chosen": -263.41705322265625, - "eval_logps/rejected": -346.3928527832031, - "eval_loss": 0.5714476108551025, - "eval_rewards/accuracies": 0.8140000104904175, - "eval_rewards/chosen": -1.573691725730896, - "eval_rewards/margins": 5.451850891113281, - "eval_rewards/rejected": -7.025542259216309, - "eval_runtime": 278.5529, - "eval_samples_per_second": 7.18, - "eval_steps_per_second": 0.449, + "eval_logits/chosen": -2.653895854949951, + "eval_logits/rejected": -2.68784761428833, + "eval_logps/chosen": -255.83551025390625, + "eval_logps/rejected": -318.1495666503906, + "eval_loss": 0.3840446174144745, + "eval_rewards/accuracies": 0.8679999709129333, + "eval_rewards/chosen": -0.8021018505096436, + "eval_rewards/margins": 8.439469337463379, + "eval_rewards/rejected": -9.241571426391602, + "eval_runtime": 276.3648, + "eval_samples_per_second": 7.237, + "eval_steps_per_second": 0.452, "step": 4100 }, { "epoch": 2.12, "learning_rate": 1.6265060240963853e-07, - "logits/chosen": -2.7999181747436523, - "logits/rejected": -2.851715326309204, - "logps/chosen": -264.3846435546875, - "logps/rejected": -366.8013610839844, - "loss": 0.0086, + "logits/chosen": -2.694314479827881, + "logits/rejected": -2.6954028606414795, + "logps/chosen": -261.2373352050781, + "logps/rejected": -355.5386047363281, + "loss": 0.0054, "rewards/accuracies": 1.0, - "rewards/chosen": 0.5434740781784058, - "rewards/margins": 9.045372009277344, - "rewards/rejected": -8.501897811889648, + "rewards/chosen": 0.8598831295967102, + "rewards/margins": 12.107415199279785, + "rewards/rejected": -11.24753189086914, "step": 4110 }, { "epoch": 2.13, "learning_rate": 1.6169439663415565e-07, - "logits/chosen": -2.8644258975982666, - "logits/rejected": -2.865626811981201, - "logps/chosen": -227.87234497070312, - "logps/rejected": -376.9765625, - "loss": 0.0066, + "logits/chosen": -2.751716375350952, + "logits/rejected": -2.706066370010376, + "logps/chosen": -224.96060180664062, + "logps/rejected": -319.9879150390625, + "loss": 0.0044, "rewards/accuracies": 1.0, - "rewards/chosen": 0.4827271103858948, - "rewards/margins": 10.994937896728516, - "rewards/rejected": -10.512212753295898, + "rewards/chosen": 0.7817245721817017, + "rewards/margins": 13.001078605651855, + "rewards/rejected": -12.219353675842285, "step": 4120 }, { "epoch": 2.13, "learning_rate": 1.6073819085867276e-07, - "logits/chosen": -2.8559463024139404, - "logits/rejected": -2.920194387435913, - "logps/chosen": -225.81591796875, - "logps/rejected": -457.36187744140625, - "loss": 0.0152, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.435596227645874, - "rewards/margins": 12.244255065917969, - "rewards/rejected": -10.808659553527832, + "logits/chosen": -2.7700047492980957, + "logits/rejected": -2.7698535919189453, + "logps/chosen": -228.0575408935547, + "logps/rejected": -394.62908935546875, + "loss": 0.0066, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.413045048713684, + "rewards/margins": 13.316288948059082, + "rewards/rejected": -11.903242111206055, "step": 4130 }, { "epoch": 2.14, "learning_rate": 1.597819850831899e-07, - "logits/chosen": -2.7609105110168457, - "logits/rejected": -2.801273822784424, - "logps/chosen": -204.47128295898438, - "logps/rejected": -378.7325134277344, - "loss": 0.0045, + "logits/chosen": -2.665053367614746, + "logits/rejected": -2.6794962882995605, + "logps/chosen": -201.50245666503906, + "logps/rejected": -347.5719909667969, + "loss": 0.0197, "rewards/accuracies": 1.0, - "rewards/chosen": 0.7425202131271362, - "rewards/margins": 10.559002876281738, - "rewards/rejected": -9.816482543945312, + "rewards/chosen": 1.0391294956207275, + "rewards/margins": 12.938039779663086, + "rewards/rejected": -11.89891242980957, "step": 4140 }, { "epoch": 2.14, "learning_rate": 1.5882577930770702e-07, - "logits/chosen": -2.828613758087158, - "logits/rejected": -2.861415386199951, - "logps/chosen": -261.1982116699219, - "logps/rejected": -361.2552490234375, - "loss": 0.0108, + "logits/chosen": -2.7185938358306885, + "logits/rejected": -2.718862533569336, + "logps/chosen": -256.0380859375, + "logps/rejected": -323.9504699707031, + "loss": 0.0071, "rewards/accuracies": 1.0, - "rewards/chosen": -0.0972164124250412, - "rewards/margins": 9.664294242858887, - "rewards/rejected": -9.76151180267334, + "rewards/chosen": 0.41879814863204956, + "rewards/margins": 10.923070907592773, + "rewards/rejected": -10.5042724609375, "step": 4150 }, { "epoch": 2.15, "learning_rate": 1.5786957353222414e-07, - "logits/chosen": -2.8005096912384033, - "logits/rejected": -2.827037811279297, - "logps/chosen": -300.31878662109375, - "logps/rejected": -373.00482177734375, - "loss": 0.0092, + "logits/chosen": -2.705132007598877, + "logits/rejected": -2.7074081897735596, + "logps/chosen": -292.0003662109375, + "logps/rejected": -327.1040954589844, + "loss": 0.0166, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.3028424978256226, - "rewards/margins": 10.192333221435547, - "rewards/rejected": -8.889491081237793, + "rewards/chosen": 2.0681941509246826, + "rewards/margins": 13.091203689575195, + "rewards/rejected": -11.023008346557617, "step": 4160 }, { "epoch": 2.15, "learning_rate": 1.5691336775674125e-07, - "logits/chosen": -2.7364072799682617, - "logits/rejected": -2.775285482406616, - "logps/chosen": -246.3945770263672, - "logps/rejected": -368.32135009765625, - "loss": 0.0135, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.5547359585762024, - "rewards/margins": 10.201997756958008, - "rewards/rejected": -9.647260665893555, + "logits/chosen": -2.641709566116333, + "logits/rejected": -2.633150339126587, + "logps/chosen": -240.72140502929688, + "logps/rejected": -315.7856140136719, + "loss": 0.008, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.122051477432251, + "rewards/margins": 12.476945877075195, + "rewards/rejected": -11.354894638061523, "step": 4170 }, { "epoch": 2.16, "learning_rate": 1.5595716198125837e-07, - "logits/chosen": -2.736896276473999, - "logits/rejected": -2.791792631149292, - "logps/chosen": -255.3802947998047, - "logps/rejected": -392.916748046875, - "loss": 0.0175, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.13744010031223297, - "rewards/margins": 10.613411903381348, - "rewards/rejected": -10.475973129272461, + "logits/chosen": -2.634437084197998, + "logits/rejected": -2.6612460613250732, + "logps/chosen": -251.1487579345703, + "logps/rejected": -352.4736633300781, + "loss": 0.0153, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5599457621574402, + "rewards/margins": 12.866701126098633, + "rewards/rejected": -12.306755065917969, "step": 4180 }, { "epoch": 2.16, "learning_rate": 1.5500095620577546e-07, - "logits/chosen": -2.822958469390869, - "logits/rejected": -2.869006633758545, - "logps/chosen": -235.73965454101562, - "logps/rejected": -324.2436218261719, - "loss": 0.0186, + "logits/chosen": -2.730055093765259, + "logits/rejected": -2.71248197555542, + "logps/chosen": -235.14187622070312, + "logps/rejected": -292.95977783203125, + "loss": 0.0329, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.5705742239952087, - "rewards/margins": 10.561267852783203, - "rewards/rejected": -9.990694046020508, + "rewards/chosen": 0.6340131163597107, + "rewards/margins": 12.814358711242676, + "rewards/rejected": -12.180343627929688, "step": 4190 }, { "epoch": 2.17, "learning_rate": 1.5404475043029257e-07, - "logits/chosen": -2.7899680137634277, - "logits/rejected": -2.828990936279297, - "logps/chosen": -236.803466796875, - "logps/rejected": -358.914306640625, - "loss": 0.0114, + "logits/chosen": -2.665670871734619, + "logits/rejected": -2.670865297317505, + "logps/chosen": -228.3739776611328, + "logps/rejected": -325.34735107421875, + "loss": 0.0132, "rewards/accuracies": 1.0, - "rewards/chosen": -0.1756969392299652, - "rewards/margins": 10.331613540649414, - "rewards/rejected": -10.50731086730957, + "rewards/chosen": 0.6672468781471252, + "rewards/margins": 11.43543815612793, + "rewards/rejected": -10.76819133758545, "step": 4200 }, { "epoch": 2.17, - "eval_logits/chosen": -2.740879535675049, - "eval_logits/rejected": -2.8101789951324463, - "eval_logps/chosen": -265.96771240234375, - "eval_logps/rejected": -348.9774475097656, - "eval_loss": 0.5547088384628296, - "eval_rewards/accuracies": 0.8019999861717224, - "eval_rewards/chosen": -1.8287551403045654, - "eval_rewards/margins": 5.455246448516846, - "eval_rewards/rejected": -7.284001350402832, - "eval_runtime": 279.0068, - "eval_samples_per_second": 7.168, - "eval_steps_per_second": 0.448, + "eval_logits/chosen": -2.6286206245422363, + "eval_logits/rejected": -2.636244297027588, + "eval_logps/chosen": -254.5709991455078, + "eval_logps/rejected": -315.15350341796875, + "eval_loss": 0.3860289454460144, + "eval_rewards/accuracies": 0.8659999966621399, + "eval_rewards/chosen": -0.6756497621536255, + "eval_rewards/margins": 8.266318321228027, + "eval_rewards/rejected": -8.941967964172363, + "eval_runtime": 276.6403, + "eval_samples_per_second": 7.23, + "eval_steps_per_second": 0.452, "step": 4200 }, { "epoch": 2.17, "learning_rate": 1.5308854465480971e-07, - "logits/chosen": -2.7795817852020264, - "logits/rejected": -2.8416852951049805, - "logps/chosen": -246.7899627685547, - "logps/rejected": -322.27886962890625, - "loss": 0.014, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.713182806968689, - "rewards/margins": 9.562291145324707, - "rewards/rejected": -8.849108695983887, + "logits/chosen": -2.6831214427948, + "logits/rejected": -2.6651432514190674, + "logps/chosen": -242.5078582763672, + "logps/rejected": -261.32537841796875, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1406787633895874, + "rewards/margins": 11.71851634979248, + "rewards/rejected": -10.577837944030762, "step": 4210 }, { "epoch": 2.18, "learning_rate": 1.5213233887932683e-07, - "logits/chosen": -2.7340545654296875, - "logits/rejected": -2.7741708755493164, - "logps/chosen": -282.47357177734375, - "logps/rejected": -369.46942138671875, - "loss": 0.0072, + "logits/chosen": -2.641648769378662, + "logits/rejected": -2.6230263710021973, + "logps/chosen": -274.18603515625, + "logps/rejected": -307.90802001953125, + "loss": 0.0074, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.5383526682853699, - "rewards/margins": 10.111784934997559, - "rewards/rejected": -9.573432922363281, + "rewards/chosen": 1.356532335281372, + "rewards/margins": 12.154073715209961, + "rewards/rejected": -10.797542572021484, "step": 4220 }, { "epoch": 2.18, "learning_rate": 1.5117613310384395e-07, - "logits/chosen": -2.7639198303222656, - "logits/rejected": -2.794893741607666, - "logps/chosen": -273.2432556152344, - "logps/rejected": -379.93414306640625, - "loss": 0.0096, + "logits/chosen": -2.6659140586853027, + "logits/rejected": -2.621009349822998, + "logps/chosen": -268.6670227050781, + "logps/rejected": -336.909912109375, + "loss": 0.0076, "rewards/accuracies": 1.0, - "rewards/chosen": -0.12266747653484344, - "rewards/margins": 11.038806915283203, - "rewards/rejected": -11.161474227905273, + "rewards/chosen": 0.34331053495407104, + "rewards/margins": 12.903196334838867, + "rewards/rejected": -12.559886932373047, "step": 4230 }, { "epoch": 2.19, "learning_rate": 1.5021992732836106e-07, - "logits/chosen": -2.776571750640869, - "logits/rejected": -2.8270602226257324, - "logps/chosen": -253.249755859375, - "logps/rejected": -416.44195556640625, - "loss": 0.0172, + "logits/chosen": -2.6648082733154297, + "logits/rejected": -2.6570217609405518, + "logps/chosen": -246.05319213867188, + "logps/rejected": -385.14117431640625, + "loss": 0.0126, "rewards/accuracies": 1.0, - "rewards/chosen": -0.12639644742012024, - "rewards/margins": 10.179259300231934, - "rewards/rejected": -10.305657386779785, + "rewards/chosen": 0.5953577160835266, + "rewards/margins": 11.963868141174316, + "rewards/rejected": -11.368510246276855, "step": 4240 }, { "epoch": 2.19, "learning_rate": 1.4926372155287818e-07, - "logits/chosen": -2.804246187210083, - "logits/rejected": -2.8505008220672607, - "logps/chosen": -224.5175018310547, - "logps/rejected": -390.603515625, - "loss": 0.0146, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.19647911190986633, - "rewards/margins": 11.195769309997559, - "rewards/rejected": -10.999292373657227, + "logits/chosen": -2.674042224884033, + "logits/rejected": -2.6660823822021484, + "logps/chosen": -218.59765625, + "logps/rejected": -328.94610595703125, + "loss": 0.0088, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.7884891629219055, + "rewards/margins": 13.258936882019043, + "rewards/rejected": -12.470447540283203, "step": 4250 }, { "epoch": 2.2, "learning_rate": 1.483075157773953e-07, - "logits/chosen": -2.811629056930542, - "logits/rejected": -2.8690712451934814, - "logps/chosen": -249.8781280517578, - "logps/rejected": -350.0668640136719, - "loss": 0.0316, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.5592334866523743, - "rewards/margins": 10.175796508789062, - "rewards/rejected": -9.616562843322754, + "logits/chosen": -2.6634860038757324, + "logits/rejected": -2.672638416290283, + "logps/chosen": -251.62234497070312, + "logps/rejected": -331.580078125, + "loss": 0.0142, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2184207439422607, + "rewards/margins": 12.689657211303711, + "rewards/rejected": -11.471237182617188, "step": 4260 }, { "epoch": 2.2, "learning_rate": 1.4735131000191238e-07, - "logits/chosen": -2.768264055252075, - "logits/rejected": -2.839480400085449, - "logps/chosen": -203.34783935546875, - "logps/rejected": -307.28948974609375, - "loss": 0.0149, + "logits/chosen": -2.5975325107574463, + "logits/rejected": -2.6272525787353516, + "logps/chosen": -196.06265258789062, + "logps/rejected": -288.2975769042969, + "loss": 0.0086, "rewards/accuracies": 1.0, - "rewards/chosen": 0.11550579220056534, - "rewards/margins": 10.141157150268555, - "rewards/rejected": -10.025650978088379, + "rewards/chosen": 0.8440225720405579, + "rewards/margins": 11.858992576599121, + "rewards/rejected": -11.014969825744629, "step": 4270 }, { "epoch": 2.21, "learning_rate": 1.4639510422642952e-07, - "logits/chosen": -2.7731645107269287, - "logits/rejected": -2.837803602218628, - "logps/chosen": -183.10386657714844, - "logps/rejected": -331.3976745605469, - "loss": 0.0174, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.4855395257472992, - "rewards/margins": 9.625699996948242, - "rewards/rejected": -9.140159606933594, + "logits/chosen": -2.6247355937957764, + "logits/rejected": -2.618931293487549, + "logps/chosen": -183.3592071533203, + "logps/rejected": -311.09686279296875, + "loss": 0.0101, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6141112446784973, + "rewards/margins": 11.920515060424805, + "rewards/rejected": -11.306404113769531, "step": 4280 }, { "epoch": 2.21, "learning_rate": 1.4543889845094664e-07, - "logits/chosen": -2.802046537399292, - "logits/rejected": -2.8697285652160645, - "logps/chosen": -333.67962646484375, - "logps/rejected": -438.260009765625, - "loss": 0.0111, + "logits/chosen": -2.625828981399536, + "logits/rejected": -2.641679286956787, + "logps/chosen": -335.74981689453125, + "logps/rejected": -373.028076171875, + "loss": 0.0275, "rewards/accuracies": 1.0, - "rewards/chosen": 0.7267153859138489, - "rewards/margins": 11.459893226623535, - "rewards/rejected": -10.733177185058594, + "rewards/chosen": 0.5228278040885925, + "rewards/margins": 13.451436996459961, + "rewards/rejected": -12.928608894348145, "step": 4290 }, { "epoch": 2.22, "learning_rate": 1.4448269267546376e-07, - "logits/chosen": -2.827871084213257, - "logits/rejected": -2.8422415256500244, - "logps/chosen": -322.83551025390625, - "logps/rejected": -417.2867736816406, - "loss": 0.0482, - "rewards/accuracies": 1.0, - "rewards/chosen": 2.1672213077545166, - "rewards/margins": 12.557757377624512, - "rewards/rejected": -10.390536308288574, + "logits/chosen": -2.6415860652923584, + "logits/rejected": -2.613152027130127, + "logps/chosen": -329.01190185546875, + "logps/rejected": -367.13531494140625, + "loss": 0.0073, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1660537719726562, + "rewards/margins": 14.009121894836426, + "rewards/rejected": -11.843066215515137, "step": 4300 }, { "epoch": 2.22, - "eval_logits/chosen": -2.787360429763794, - "eval_logits/rejected": -2.851278305053711, - "eval_logps/chosen": -259.26263427734375, - "eval_logps/rejected": -340.8786315917969, - "eval_loss": 0.5436837077140808, - "eval_rewards/accuracies": 0.8140000104904175, - "eval_rewards/chosen": -1.1582494974136353, - "eval_rewards/margins": 5.31587028503418, - "eval_rewards/rejected": -6.474120140075684, - "eval_runtime": 278.6515, - "eval_samples_per_second": 7.177, - "eval_steps_per_second": 0.449, + "eval_logits/chosen": -2.644742012023926, + "eval_logits/rejected": -2.6569817066192627, + "eval_logps/chosen": -254.68019104003906, + "eval_logps/rejected": -319.19805908203125, + "eval_loss": 0.40645551681518555, + "eval_rewards/accuracies": 0.8700000047683716, + "eval_rewards/chosen": -0.6865688562393188, + "eval_rewards/margins": 8.659854888916016, + "eval_rewards/rejected": -9.346423149108887, + "eval_runtime": 276.6971, + "eval_samples_per_second": 7.228, + "eval_steps_per_second": 0.452, "step": 4300 }, { "epoch": 2.23, "learning_rate": 1.4352648689998087e-07, - "logits/chosen": -2.843700647354126, - "logits/rejected": -2.867222309112549, - "logps/chosen": -245.06906127929688, - "logps/rejected": -332.5545959472656, - "loss": 0.0094, + "logits/chosen": -2.7026374340057373, + "logits/rejected": -2.6729674339294434, + "logps/chosen": -243.1049346923828, + "logps/rejected": -301.85467529296875, + "loss": 0.0048, "rewards/accuracies": 1.0, - "rewards/chosen": 1.0607502460479736, - "rewards/margins": 10.345534324645996, - "rewards/rejected": -9.284785270690918, + "rewards/chosen": 1.25716233253479, + "rewards/margins": 13.114236831665039, + "rewards/rejected": -11.857072830200195, "step": 4310 }, { "epoch": 2.23, "learning_rate": 1.42570281124498e-07, - "logits/chosen": -2.7883362770080566, - "logits/rejected": -2.8541436195373535, - "logps/chosen": -284.0835876464844, - "logps/rejected": -381.5480041503906, - "loss": 0.0185, + "logits/chosen": -2.646242380142212, + "logits/rejected": -2.677393913269043, + "logps/chosen": -281.4183044433594, + "logps/rejected": -357.06402587890625, + "loss": 0.0111, "rewards/accuracies": 1.0, - "rewards/chosen": 1.4265800714492798, - "rewards/margins": 10.460817337036133, - "rewards/rejected": -9.0342378616333, + "rewards/chosen": 1.6915756464004517, + "rewards/margins": 12.357884407043457, + "rewards/rejected": -10.666309356689453, "step": 4320 }, { "epoch": 2.24, "learning_rate": 1.416140753490151e-07, - "logits/chosen": -2.7982468605041504, - "logits/rejected": -2.883110523223877, - "logps/chosen": -281.6730041503906, - "logps/rejected": -414.79962158203125, - "loss": 0.0074, + "logits/chosen": -2.655795097351074, + "logits/rejected": -2.694797992706299, + "logps/chosen": -277.06414794921875, + "logps/rejected": -387.6266784667969, + "loss": 0.0054, "rewards/accuracies": 1.0, - "rewards/chosen": 0.39604753255844116, - "rewards/margins": 11.227680206298828, - "rewards/rejected": -10.831633567810059, + "rewards/chosen": 0.8446704745292664, + "rewards/margins": 13.449702262878418, + "rewards/rejected": -12.605031967163086, "step": 4330 }, { "epoch": 2.24, "learning_rate": 1.4065786957353222e-07, - "logits/chosen": -2.815957546234131, - "logits/rejected": -2.8724205493927, - "logps/chosen": -259.5505065917969, - "logps/rejected": -376.6618347167969, - "loss": 0.0147, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 1.3497107028961182, - "rewards/margins": 10.574721336364746, - "rewards/rejected": -9.225010871887207, + "logits/chosen": -2.6699233055114746, + "logits/rejected": -2.6946558952331543, + "logps/chosen": -252.34262084960938, + "logps/rejected": -347.46337890625, + "loss": 0.0102, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 2.0705008506774902, + "rewards/margins": 13.38475227355957, + "rewards/rejected": -11.314249992370605, "step": 4340 }, { "epoch": 2.25, "learning_rate": 1.3970166379804933e-07, - "logits/chosen": -2.804654598236084, - "logits/rejected": -2.8554275035858154, - "logps/chosen": -284.3072509765625, - "logps/rejected": -373.55084228515625, - "loss": 0.0092, + "logits/chosen": -2.675903558731079, + "logits/rejected": -2.6673693656921387, + "logps/chosen": -278.6595153808594, + "logps/rejected": -353.07073974609375, + "loss": 0.0067, "rewards/accuracies": 1.0, - "rewards/chosen": 0.5603520274162292, - "rewards/margins": 10.00316333770752, - "rewards/rejected": -9.442811012268066, + "rewards/chosen": 1.1251275539398193, + "rewards/margins": 12.58167552947998, + "rewards/rejected": -11.456548690795898, "step": 4350 }, { "epoch": 2.25, "learning_rate": 1.3874545802256645e-07, - "logits/chosen": -2.8638744354248047, - "logits/rejected": -2.8915674686431885, - "logps/chosen": -279.210205078125, - "logps/rejected": -344.73651123046875, - "loss": 0.0103, + "logits/chosen": -2.729840040206909, + "logits/rejected": -2.7106354236602783, + "logps/chosen": -285.21563720703125, + "logps/rejected": -299.8715515136719, + "loss": 0.0041, "rewards/accuracies": 1.0, - "rewards/chosen": 0.7013729810714722, - "rewards/margins": 10.824358940124512, - "rewards/rejected": -10.12298583984375, + "rewards/chosen": 0.8291403651237488, + "rewards/margins": 13.70796012878418, + "rewards/rejected": -12.878822326660156, "step": 4360 }, { "epoch": 2.26, "learning_rate": 1.3778925224708357e-07, - "logits/chosen": -2.74699330329895, - "logits/rejected": -2.7918925285339355, - "logps/chosen": -247.57296752929688, - "logps/rejected": -379.8360900878906, - "loss": 0.016, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.507546067237854, - "rewards/margins": 10.973848342895508, - "rewards/rejected": -10.466302871704102, + "logits/chosen": -2.5956339836120605, + "logits/rejected": -2.60099196434021, + "logps/chosen": -243.3638458251953, + "logps/rejected": -350.39398193359375, + "loss": 0.0161, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9282540082931519, + "rewards/margins": 12.694877624511719, + "rewards/rejected": -11.766622543334961, "step": 4370 }, { "epoch": 2.26, "learning_rate": 1.3683304647160068e-07, - "logits/chosen": -2.755742311477661, - "logits/rejected": -2.8171803951263428, - "logps/chosen": -208.4476776123047, - "logps/rejected": -373.80621337890625, - "loss": 0.011, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.6287239193916321, - "rewards/margins": 10.667415618896484, - "rewards/rejected": -10.038691520690918, + "logits/chosen": -2.631574869155884, + "logits/rejected": -2.632842540740967, + "logps/chosen": -199.08560180664062, + "logps/rejected": -339.928955078125, + "loss": 0.0072, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5621168613433838, + "rewards/margins": 12.929792404174805, + "rewards/rejected": -11.36767578125, "step": 4380 }, { "epoch": 2.27, "learning_rate": 1.358768406961178e-07, - "logits/chosen": -2.808208703994751, - "logits/rejected": -2.864919662475586, - "logps/chosen": -214.84634399414062, - "logps/rejected": -361.06890869140625, - "loss": 0.0084, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.06784908473491669, - "rewards/margins": 10.550009727478027, - "rewards/rejected": -10.482160568237305, + "logits/chosen": -2.6779558658599854, + "logits/rejected": -2.7034175395965576, + "logps/chosen": -208.13150024414062, + "logps/rejected": -315.4166259765625, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7393345832824707, + "rewards/margins": 12.127341270446777, + "rewards/rejected": -11.388006210327148, "step": 4390 }, { "epoch": 2.27, "learning_rate": 1.349206349206349e-07, - "logits/chosen": -2.832120180130005, - "logits/rejected": -2.8752362728118896, - "logps/chosen": -162.49362182617188, - "logps/rejected": -339.9632568359375, - "loss": 0.0172, + "logits/chosen": -2.720271348953247, + "logits/rejected": -2.6917262077331543, + "logps/chosen": -161.9430389404297, + "logps/rejected": -322.6776428222656, + "loss": 0.0061, "rewards/accuracies": 1.0, - "rewards/chosen": 0.02236497402191162, - "rewards/margins": 11.080262184143066, - "rewards/rejected": -11.057897567749023, + "rewards/chosen": 0.07742256671190262, + "rewards/margins": 12.233587265014648, + "rewards/rejected": -12.156164169311523, "step": 4400 }, { "epoch": 2.27, - "eval_logits/chosen": -2.7835676670074463, - "eval_logits/rejected": -2.847372531890869, - "eval_logps/chosen": -263.640869140625, - "eval_logps/rejected": -347.76019287109375, - "eval_loss": 0.5489197969436646, - "eval_rewards/accuracies": 0.8100000023841858, - "eval_rewards/chosen": -1.5960688591003418, - "eval_rewards/margins": 5.56620979309082, - "eval_rewards/rejected": -7.16227912902832, - "eval_runtime": 278.2686, - "eval_samples_per_second": 7.187, - "eval_steps_per_second": 0.449, + "eval_logits/chosen": -2.6548290252685547, + "eval_logits/rejected": -2.6599998474121094, + "eval_logps/chosen": -251.37091064453125, + "eval_logps/rejected": -316.03179931640625, + "eval_loss": 0.3912397623062134, + "eval_rewards/accuracies": 0.8619999885559082, + "eval_rewards/chosen": -0.35564109683036804, + "eval_rewards/margins": 8.674158096313477, + "eval_rewards/rejected": -9.02979850769043, + "eval_runtime": 276.498, + "eval_samples_per_second": 7.233, + "eval_steps_per_second": 0.452, "step": 4400 }, { "epoch": 2.28, "learning_rate": 1.3396442914515203e-07, - "logits/chosen": -2.7538981437683105, - "logits/rejected": -2.797692060470581, - "logps/chosen": -267.57086181640625, - "logps/rejected": -371.95587158203125, - "loss": 0.0128, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.38313159346580505, - "rewards/margins": 9.685579299926758, - "rewards/rejected": -10.068711280822754, + "logits/chosen": -2.6260523796081543, + "logits/rejected": -2.613741397857666, + "logps/chosen": -263.2807312011719, + "logps/rejected": -310.61669921875, + "loss": 0.018, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.49878963828086853, + "rewards/margins": 12.43281364440918, + "rewards/rejected": -11.934021949768066, "step": 4410 }, { "epoch": 2.28, "learning_rate": 1.3300822336966917e-07, - "logits/chosen": -2.8126959800720215, - "logits/rejected": -2.7908413410186768, - "logps/chosen": -300.4286193847656, - "logps/rejected": -410.2023010253906, - "loss": 0.0085, + "logits/chosen": -2.6836166381835938, + "logits/rejected": -2.5941810607910156, + "logps/chosen": -295.9386291503906, + "logps/rejected": -371.14892578125, + "loss": 0.0031, "rewards/accuracies": 1.0, - "rewards/chosen": 1.0054407119750977, - "rewards/margins": 11.864087104797363, - "rewards/rejected": -10.85864543914795, + "rewards/chosen": 1.4565682411193848, + "rewards/margins": 13.721742630004883, + "rewards/rejected": -12.265172958374023, "step": 4420 }, { "epoch": 2.29, "learning_rate": 1.3205201759418626e-07, - "logits/chosen": -2.6961915493011475, - "logits/rejected": -2.730517625808716, - "logps/chosen": -290.7415771484375, - "logps/rejected": -405.54718017578125, - "loss": 0.0083, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.3425076007843018, - "rewards/margins": 10.741376876831055, - "rewards/rejected": -9.398869514465332, + "logits/chosen": -2.581943988800049, + "logits/rejected": -2.5792744159698486, + "logps/chosen": -291.35723876953125, + "logps/rejected": -355.77484130859375, + "loss": 0.0148, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.281065583229065, + "rewards/margins": 11.515872955322266, + "rewards/rejected": -10.234807014465332, "step": 4430 }, { "epoch": 2.29, "learning_rate": 1.3109581181870338e-07, - "logits/chosen": -2.8597958087921143, - "logits/rejected": -2.8856372833251953, - "logps/chosen": -278.34490966796875, - "logps/rejected": -371.2419738769531, - "loss": 0.0125, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 1.2198222875595093, - "rewards/margins": 10.515420913696289, - "rewards/rejected": -9.295598983764648, + "logits/chosen": -2.766364336013794, + "logits/rejected": -2.707728385925293, + "logps/chosen": -274.43304443359375, + "logps/rejected": -326.385009765625, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6144554615020752, + "rewards/margins": 12.849863052368164, + "rewards/rejected": -11.235407829284668, "step": 4440 }, { "epoch": 2.3, "learning_rate": 1.301396060432205e-07, - "logits/chosen": -2.7863852977752686, - "logits/rejected": -2.8537216186523438, - "logps/chosen": -253.5373992919922, - "logps/rejected": -363.5780334472656, - "loss": 0.0068, + "logits/chosen": -2.6954822540283203, + "logits/rejected": -2.6646182537078857, + "logps/chosen": -248.5667266845703, + "logps/rejected": -290.41302490234375, + "loss": 0.0047, "rewards/accuracies": 1.0, - "rewards/chosen": 0.5466651916503906, - "rewards/margins": 11.057966232299805, - "rewards/rejected": -10.51130199432373, + "rewards/chosen": 1.0439783334732056, + "rewards/margins": 12.735447883605957, + "rewards/rejected": -11.691468238830566, "step": 4450 }, { "epoch": 2.3, "learning_rate": 1.291834002677376e-07, - "logits/chosen": -2.7333171367645264, - "logits/rejected": -2.7991576194763184, - "logps/chosen": -222.57424926757812, - "logps/rejected": -377.4979248046875, - "loss": 0.0097, + "logits/chosen": -2.6302859783172607, + "logits/rejected": -2.647365093231201, + "logps/chosen": -213.2334747314453, + "logps/rejected": -316.852294921875, + "loss": 0.0063, "rewards/accuracies": 1.0, - "rewards/chosen": 0.21776556968688965, - "rewards/margins": 10.281082153320312, - "rewards/rejected": -10.063316345214844, + "rewards/chosen": 1.1518433094024658, + "rewards/margins": 12.262166023254395, + "rewards/rejected": -11.110322952270508, "step": 4460 }, { "epoch": 2.31, "learning_rate": 1.2822719449225472e-07, - "logits/chosen": -2.763277530670166, - "logits/rejected": -2.8399055004119873, - "logps/chosen": -170.37014770507812, - "logps/rejected": -340.293212890625, - "loss": 0.0093, + "logits/chosen": -2.649827480316162, + "logits/rejected": -2.700962543487549, + "logps/chosen": -160.63975524902344, + "logps/rejected": -332.7588195800781, + "loss": 0.0034, "rewards/accuracies": 1.0, - "rewards/chosen": -0.284454345703125, - "rewards/margins": 9.632649421691895, - "rewards/rejected": -9.91710376739502, + "rewards/chosen": 0.688583254814148, + "rewards/margins": 11.438365936279297, + "rewards/rejected": -10.749783515930176, "step": 4470 }, { "epoch": 2.31, "learning_rate": 1.2727098871677184e-07, - "logits/chosen": -2.8263633251190186, - "logits/rejected": -2.8704276084899902, - "logps/chosen": -315.4873962402344, - "logps/rejected": -437.9556579589844, - "loss": 0.0037, + "logits/chosen": -2.7301270961761475, + "logits/rejected": -2.704988956451416, + "logps/chosen": -313.9676208496094, + "logps/rejected": -355.34130859375, + "loss": 0.002, "rewards/accuracies": 1.0, - "rewards/chosen": 0.6770858764648438, - "rewards/margins": 11.242297172546387, - "rewards/rejected": -10.565211296081543, + "rewards/chosen": 1.0236303806304932, + "rewards/margins": 13.098129272460938, + "rewards/rejected": -12.074499130249023, "step": 4480 }, { "epoch": 2.32, "learning_rate": 1.2631478294128898e-07, - "logits/chosen": -2.8527493476867676, - "logits/rejected": -2.885178565979004, - "logps/chosen": -229.99075317382812, - "logps/rejected": -385.3793029785156, - "loss": 0.0463, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.27444297075271606, - "rewards/margins": 10.513134956359863, - "rewards/rejected": -10.238691329956055, + "logits/chosen": -2.749298572540283, + "logits/rejected": -2.7684879302978516, + "logps/chosen": -229.85488891601562, + "logps/rejected": -355.70208740234375, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9541438817977905, + "rewards/margins": 12.498308181762695, + "rewards/rejected": -11.544164657592773, "step": 4490 }, { "epoch": 2.32, "learning_rate": 1.253585771658061e-07, - "logits/chosen": -2.831853151321411, - "logits/rejected": -2.8576908111572266, - "logps/chosen": -238.5697021484375, - "logps/rejected": -388.0924377441406, - "loss": 0.1044, + "logits/chosen": -2.7163774967193604, + "logits/rejected": -2.725437879562378, + "logps/chosen": -234.7379150390625, + "logps/rejected": -354.0361633300781, + "loss": 0.0159, "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.5936632752418518, - "rewards/margins": 10.974578857421875, - "rewards/rejected": -10.380915641784668, + "rewards/chosen": 0.9780641794204712, + "rewards/margins": 11.38829517364502, + "rewards/rejected": -10.41023063659668, "step": 4500 }, { "epoch": 2.32, - "eval_logits/chosen": -2.783907890319824, - "eval_logits/rejected": -2.848212957382202, - "eval_logps/chosen": -266.22772216796875, - "eval_logps/rejected": -353.6324768066406, - "eval_loss": 0.5818387269973755, - "eval_rewards/accuracies": 0.8140000104904175, - "eval_rewards/chosen": -1.8547568321228027, - "eval_rewards/margins": 5.894747734069824, - "eval_rewards/rejected": -7.749504089355469, - "eval_runtime": 278.5575, - "eval_samples_per_second": 7.18, - "eval_steps_per_second": 0.449, + "eval_logits/chosen": -2.6765902042388916, + "eval_logits/rejected": -2.6957597732543945, + "eval_logps/chosen": -251.01815795898438, + "eval_logps/rejected": -312.6424560546875, + "eval_loss": 0.38638532161712646, + "eval_rewards/accuracies": 0.8619999885559082, + "eval_rewards/chosen": -0.3203662931919098, + "eval_rewards/margins": 8.370494842529297, + "eval_rewards/rejected": -8.690861701965332, + "eval_runtime": 276.1066, + "eval_samples_per_second": 7.244, + "eval_steps_per_second": 0.453, "step": 4500 }, { "epoch": 2.33, "learning_rate": 1.2440237139032319e-07, - "logits/chosen": -2.8777260780334473, - "logits/rejected": -2.902764320373535, - "logps/chosen": -277.8800964355469, - "logps/rejected": -350.5680236816406, - "loss": 0.0124, + "logits/chosen": -2.7625091075897217, + "logits/rejected": -2.7434208393096924, + "logps/chosen": -272.5645446777344, + "logps/rejected": -302.9764709472656, + "loss": 0.0027, "rewards/accuracies": 1.0, - "rewards/chosen": 0.32009559869766235, - "rewards/margins": 10.575444221496582, - "rewards/rejected": -10.255348205566406, + "rewards/chosen": 0.8495186567306519, + "rewards/margins": 12.835386276245117, + "rewards/rejected": -11.985868453979492, "step": 4510 }, { "epoch": 2.33, "learning_rate": 1.234461656148403e-07, - "logits/chosen": -2.8503713607788086, - "logits/rejected": -2.90812087059021, - "logps/chosen": -303.7490539550781, - "logps/rejected": -406.6163635253906, - "loss": 0.0069, + "logits/chosen": -2.733023166656494, + "logits/rejected": -2.7103774547576904, + "logps/chosen": -294.3748779296875, + "logps/rejected": -347.6996765136719, + "loss": 0.0075, "rewards/accuracies": 1.0, - "rewards/chosen": -0.04932591691613197, - "rewards/margins": 11.66883659362793, - "rewards/rejected": -11.718160629272461, + "rewards/chosen": 0.8830663561820984, + "rewards/margins": 13.676523208618164, + "rewards/rejected": -12.793456077575684, "step": 4520 }, { "epoch": 2.34, "learning_rate": 1.2248995983935742e-07, - "logits/chosen": -2.844027042388916, - "logits/rejected": -2.8921597003936768, - "logps/chosen": -255.0517120361328, - "logps/rejected": -374.0168151855469, - "loss": 0.0047, + "logits/chosen": -2.7622809410095215, + "logits/rejected": -2.762237310409546, + "logps/chosen": -253.2561492919922, + "logps/rejected": -333.795166015625, + "loss": 0.0037, "rewards/accuracies": 1.0, - "rewards/chosen": 1.9114421606063843, - "rewards/margins": 11.845464706420898, - "rewards/rejected": -9.934022903442383, + "rewards/chosen": 2.833845853805542, + "rewards/margins": 13.008646011352539, + "rewards/rejected": -10.174800872802734, "step": 4530 }, { "epoch": 2.34, "learning_rate": 1.2153375406387456e-07, - "logits/chosen": -2.7948267459869385, - "logits/rejected": -2.8465352058410645, - "logps/chosen": -280.3050231933594, - "logps/rejected": -407.7957458496094, - "loss": 0.0053, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.20190688967704773, - "rewards/margins": 10.695255279541016, - "rewards/rejected": -10.493348121643066, + "logits/chosen": -2.684333324432373, + "logits/rejected": -2.712156295776367, + "logps/chosen": -265.5671691894531, + "logps/rejected": -375.2867736816406, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6756912469863892, + "rewards/margins": 12.839948654174805, + "rewards/rejected": -11.16425609588623, "step": 4540 }, { "epoch": 2.35, "learning_rate": 1.2057754828839165e-07, - "logits/chosen": -2.823227643966675, - "logits/rejected": -2.910917282104492, - "logps/chosen": -234.4201202392578, - "logps/rejected": -369.2624206542969, - "loss": 0.009, + "logits/chosen": -2.714963436126709, + "logits/rejected": -2.751513957977295, + "logps/chosen": -233.6080322265625, + "logps/rejected": -309.8179626464844, + "loss": 0.008, "rewards/accuracies": 1.0, - "rewards/chosen": 0.06653478741645813, - "rewards/margins": 10.437435150146484, - "rewards/rejected": -10.37090015411377, + "rewards/chosen": 0.6140552163124084, + "rewards/margins": 11.839889526367188, + "rewards/rejected": -11.22583293914795, "step": 4550 }, { "epoch": 2.35, "learning_rate": 1.1962134251290876e-07, - "logits/chosen": -2.8018596172332764, - "logits/rejected": -2.8438680171966553, - "logps/chosen": -262.4857482910156, - "logps/rejected": -337.6358337402344, - "loss": 0.017, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.25648975372314453, - "rewards/margins": 10.115443229675293, - "rewards/rejected": -10.371932983398438, + "logits/chosen": -2.6816468238830566, + "logits/rejected": -2.6856436729431152, + "logps/chosen": -256.4266052246094, + "logps/rejected": -302.91351318359375, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.34942373633384705, + "rewards/margins": 11.370274543762207, + "rewards/rejected": -11.020849227905273, "step": 4560 }, { "epoch": 2.36, "learning_rate": 1.1866513673742588e-07, - "logits/chosen": -2.783787488937378, - "logits/rejected": -2.8495638370513916, - "logps/chosen": -283.16571044921875, - "logps/rejected": -342.12945556640625, - "loss": 0.0208, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.38057252764701843, - "rewards/margins": 10.06275749206543, - "rewards/rejected": -9.682184219360352, + "logits/chosen": -2.6626667976379395, + "logits/rejected": -2.6787123680114746, + "logps/chosen": -276.158447265625, + "logps/rejected": -276.16571044921875, + "loss": 0.0157, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0911033153533936, + "rewards/margins": 12.42508316040039, + "rewards/rejected": -11.33398151397705, "step": 4570 }, { "epoch": 2.36, "learning_rate": 1.1770893096194301e-07, - "logits/chosen": -2.7712864875793457, - "logits/rejected": -2.8020853996276855, - "logps/chosen": -262.7274475097656, - "logps/rejected": -374.31524658203125, - "loss": 0.0094, + "logits/chosen": -2.6180026531219482, + "logits/rejected": -2.602674961090088, + "logps/chosen": -253.6972198486328, + "logps/rejected": -347.20819091796875, + "loss": 0.0047, "rewards/accuracies": 1.0, - "rewards/chosen": 1.127746343612671, - "rewards/margins": 11.753847122192383, - "rewards/rejected": -10.62610149383545, + "rewards/chosen": 2.031054973602295, + "rewards/margins": 13.9181547164917, + "rewards/rejected": -11.887101173400879, "step": 4580 }, { "epoch": 2.37, "learning_rate": 1.1675272518646012e-07, - "logits/chosen": -2.834930896759033, - "logits/rejected": -2.8962676525115967, - "logps/chosen": -204.0745086669922, - "logps/rejected": -364.3519592285156, - "loss": 0.0257, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.20914089679718018, - "rewards/margins": 9.767343521118164, - "rewards/rejected": -9.558202743530273, + "logits/chosen": -2.7086029052734375, + "logits/rejected": -2.693084716796875, + "logps/chosen": -200.48143005371094, + "logps/rejected": -322.3230285644531, + "loss": 0.0132, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.569089412689209, + "rewards/margins": 11.423630714416504, + "rewards/rejected": -10.854541778564453, "step": 4590 }, { "epoch": 2.37, "learning_rate": 1.1579651941097724e-07, - "logits/chosen": -2.8558883666992188, - "logits/rejected": -2.9146106243133545, - "logps/chosen": -232.75039672851562, - "logps/rejected": -366.9052429199219, - "loss": 0.012, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.9267519116401672, - "rewards/margins": 10.588906288146973, - "rewards/rejected": -9.662155151367188, + "logits/chosen": -2.7474679946899414, + "logits/rejected": -2.7574124336242676, + "logps/chosen": -228.36807250976562, + "logps/rejected": -341.8480529785156, + "loss": 0.0065, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3649846315383911, + "rewards/margins": 12.356697082519531, + "rewards/rejected": -10.99171257019043, "step": 4600 }, { "epoch": 2.37, - "eval_logits/chosen": -2.7866322994232178, - "eval_logits/rejected": -2.8511645793914795, - "eval_logps/chosen": -264.5919189453125, - "eval_logps/rejected": -351.72418212890625, - "eval_loss": 0.5813160538673401, - "eval_rewards/accuracies": 0.8159999847412109, - "eval_rewards/chosen": -1.6911762952804565, - "eval_rewards/margins": 5.867499828338623, - "eval_rewards/rejected": -7.558675765991211, - "eval_runtime": 278.7682, - "eval_samples_per_second": 7.174, - "eval_steps_per_second": 0.448, + "eval_logits/chosen": -2.6636369228363037, + "eval_logits/rejected": -2.680604934692383, + "eval_logps/chosen": -256.1195983886719, + "eval_logps/rejected": -320.3636779785156, + "eval_loss": 0.41182947158813477, + "eval_rewards/accuracies": 0.8579999804496765, + "eval_rewards/chosen": -0.830512285232544, + "eval_rewards/margins": 8.632473945617676, + "eval_rewards/rejected": -9.46298599243164, + "eval_runtime": 276.5512, + "eval_samples_per_second": 7.232, + "eval_steps_per_second": 0.452, "step": 4600 }, { "epoch": 2.38, "learning_rate": 1.1484031363549436e-07, - "logits/chosen": -2.7684288024902344, - "logits/rejected": -2.804677963256836, - "logps/chosen": -320.72589111328125, - "logps/rejected": -374.26654052734375, - "loss": 0.0165, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.35492923855781555, - "rewards/margins": 10.75482177734375, - "rewards/rejected": -10.399892807006836, + "logits/chosen": -2.6278457641601562, + "logits/rejected": -2.611564874649048, + "logps/chosen": -319.51898193359375, + "logps/rejected": -350.20623779296875, + "loss": 0.0093, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.4739806652069092, + "rewards/margins": 13.100465774536133, + "rewards/rejected": -12.626484870910645, "step": 4610 }, { "epoch": 2.39, "learning_rate": 1.1388410786001147e-07, - "logits/chosen": -2.8026232719421387, - "logits/rejected": -2.85943341255188, - "logps/chosen": -211.3720703125, - "logps/rejected": -388.8536071777344, - "loss": 0.0091, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.15674147009849548, - "rewards/margins": 10.096702575683594, - "rewards/rejected": -10.253443717956543, + "logits/chosen": -2.6990890502929688, + "logits/rejected": -2.696523427963257, + "logps/chosen": -205.5595703125, + "logps/rejected": -345.6148681640625, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4238893389701843, + "rewards/margins": 12.81701946258545, + "rewards/rejected": -12.393131256103516, "step": 4620 }, { "epoch": 2.39, "learning_rate": 1.1292790208452859e-07, - "logits/chosen": -2.840369939804077, - "logits/rejected": -2.833326816558838, - "logps/chosen": -233.71926879882812, - "logps/rejected": -393.7159118652344, - "loss": 0.0118, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.027848612517118454, - "rewards/margins": 11.55732250213623, - "rewards/rejected": -11.585172653198242, + "logits/chosen": -2.7160212993621826, + "logits/rejected": -2.6591362953186035, + "logps/chosen": -235.47659301757812, + "logps/rejected": -296.60040283203125, + "loss": 0.015, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.20147009193897247, + "rewards/margins": 12.931154251098633, + "rewards/rejected": -13.132623672485352, "step": 4630 }, { "epoch": 2.4, "learning_rate": 1.119716963090457e-07, - "logits/chosen": -2.794414758682251, - "logits/rejected": -2.8389816284179688, - "logps/chosen": -240.9445343017578, - "logps/rejected": -396.8078308105469, - "loss": 0.0102, + "logits/chosen": -2.669713020324707, + "logits/rejected": -2.67938232421875, + "logps/chosen": -236.20968627929688, + "logps/rejected": -367.20367431640625, + "loss": 0.0065, "rewards/accuracies": 1.0, - "rewards/chosen": -0.32970330119132996, - "rewards/margins": 11.215761184692383, - "rewards/rejected": -11.545463562011719, + "rewards/chosen": 0.4860232472419739, + "rewards/margins": 12.139914512634277, + "rewards/rejected": -11.653892517089844, "step": 4640 }, { "epoch": 2.4, "learning_rate": 1.1101549053356282e-07, - "logits/chosen": -2.7997448444366455, - "logits/rejected": -2.8671469688415527, - "logps/chosen": -243.86978149414062, - "logps/rejected": -387.15045166015625, - "loss": 0.0168, + "logits/chosen": -2.686720848083496, + "logits/rejected": -2.7255923748016357, + "logps/chosen": -240.42526245117188, + "logps/rejected": -359.03656005859375, + "loss": 0.0132, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.43554702401161194, - "rewards/margins": 11.55484390258789, - "rewards/rejected": -11.11929702758789, + "rewards/chosen": 0.7799973487854004, + "rewards/margins": 13.230825424194336, + "rewards/rejected": -12.45082950592041, "step": 4650 }, { "epoch": 2.41, "learning_rate": 1.1005928475807993e-07, - "logits/chosen": -2.7641167640686035, - "logits/rejected": -2.822042942047119, - "logps/chosen": -204.0389862060547, - "logps/rejected": -362.5557556152344, - "loss": 0.0083, + "logits/chosen": -2.677825927734375, + "logits/rejected": -2.683408498764038, + "logps/chosen": -198.4245147705078, + "logps/rejected": -312.18310546875, + "loss": 0.0052, "rewards/accuracies": 1.0, - "rewards/chosen": 0.1727055460214615, - "rewards/margins": 11.18034553527832, - "rewards/rejected": -11.007640838623047, + "rewards/chosen": 0.7352830171585083, + "rewards/margins": 12.68535327911377, + "rewards/rejected": -11.95007038116455, "step": 4660 }, { "epoch": 2.41, "learning_rate": 1.0910307898259705e-07, - "logits/chosen": -2.724806308746338, - "logits/rejected": -2.793595314025879, - "logps/chosen": -233.08151245117188, - "logps/rejected": -413.00274658203125, - "loss": 0.0154, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.1520412564277649, - "rewards/margins": 10.408720016479492, - "rewards/rejected": -10.560762405395508, + "logits/chosen": -2.6178271770477295, + "logits/rejected": -2.6650643348693848, + "logps/chosen": -225.069091796875, + "logps/rejected": -374.01434326171875, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6539960503578186, + "rewards/margins": 13.32505989074707, + "rewards/rejected": -12.671061515808105, "step": 4670 }, { "epoch": 2.42, "learning_rate": 1.0814687320711418e-07, - "logits/chosen": -2.6983354091644287, - "logits/rejected": -2.76259446144104, - "logps/chosen": -199.3309326171875, - "logps/rejected": -420.94757080078125, - "loss": 0.0096, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.28074705600738525, - "rewards/margins": 11.806534767150879, - "rewards/rejected": -11.525787353515625, + "logits/chosen": -2.581111192703247, + "logits/rejected": -2.6189424991607666, + "logps/chosen": -192.6226348876953, + "logps/rejected": -392.97650146484375, + "loss": 0.0074, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.958675742149353, + "rewards/margins": 14.340494155883789, + "rewards/rejected": -13.381817817687988, "step": 4680 }, { "epoch": 2.42, "learning_rate": 1.0719066743163128e-07, - "logits/chosen": -2.7496988773345947, - "logits/rejected": -2.8320717811584473, - "logps/chosen": -302.5816345214844, - "logps/rejected": -407.7145690917969, - "loss": 0.0166, + "logits/chosen": -2.6409318447113037, + "logits/rejected": -2.6935670375823975, + "logps/chosen": -289.792724609375, + "logps/rejected": -373.4590148925781, + "loss": 0.0092, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.16056537628173828, - "rewards/margins": 11.588438034057617, - "rewards/rejected": -11.749003410339355, + "rewards/chosen": 1.1182327270507812, + "rewards/margins": 14.061304092407227, + "rewards/rejected": -12.943069458007812, "step": 4690 }, { "epoch": 2.43, "learning_rate": 1.062344616561484e-07, - "logits/chosen": -2.8104662895202637, - "logits/rejected": -2.8494856357574463, - "logps/chosen": -302.8609313964844, - "logps/rejected": -400.6754150390625, - "loss": 0.0122, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.10702119022607803, - "rewards/margins": 12.200922966003418, - "rewards/rejected": -12.093901634216309, + "logits/chosen": -2.691370725631714, + "logits/rejected": -2.6682136058807373, + "logps/chosen": -308.5107421875, + "logps/rejected": -319.19122314453125, + "loss": 0.0095, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3439258933067322, + "rewards/margins": 12.584671020507812, + "rewards/rejected": -12.240745544433594, "step": 4700 }, { "epoch": 2.43, - "eval_logits/chosen": -2.75579571723938, - "eval_logits/rejected": -2.8210208415985107, - "eval_logps/chosen": -270.0638732910156, - "eval_logps/rejected": -359.8251647949219, - "eval_loss": 0.605195164680481, - "eval_rewards/accuracies": 0.8059999942779541, - "eval_rewards/chosen": -2.238370180130005, - "eval_rewards/margins": 6.130407810211182, - "eval_rewards/rejected": -8.36877727508545, - "eval_runtime": 278.3561, - "eval_samples_per_second": 7.185, - "eval_steps_per_second": 0.449, + "eval_logits/chosen": -2.6588754653930664, + "eval_logits/rejected": -2.6817219257354736, + "eval_logps/chosen": -257.84149169921875, + "eval_logps/rejected": -324.0401916503906, + "eval_loss": 0.4204564690589905, + "eval_rewards/accuracies": 0.8659999966621399, + "eval_rewards/chosen": -1.0026999711990356, + "eval_rewards/margins": 8.827935218811035, + "eval_rewards/rejected": -9.830636024475098, + "eval_runtime": 276.052, + "eval_samples_per_second": 7.245, + "eval_steps_per_second": 0.453, "step": 4700 }, { "epoch": 2.43, "learning_rate": 1.0527825588066551e-07, - "logits/chosen": -2.7616004943847656, - "logits/rejected": -2.8404548168182373, - "logps/chosen": -256.90576171875, - "logps/rejected": -385.58367919921875, - "loss": 0.009, + "logits/chosen": -2.6532177925109863, + "logits/rejected": -2.6817214488983154, + "logps/chosen": -255.25662231445312, + "logps/rejected": -332.6568298339844, + "loss": 0.0081, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.34221506118774414, - "rewards/margins": 12.297042846679688, - "rewards/rejected": -11.954826354980469, + "rewards/chosen": 0.504800021648407, + "rewards/margins": 13.3855562210083, + "rewards/rejected": -12.880755424499512, "step": 4710 }, { "epoch": 2.44, "learning_rate": 1.0432205010518264e-07, - "logits/chosen": -2.779740333557129, - "logits/rejected": -2.840048313140869, - "logps/chosen": -210.28982543945312, - "logps/rejected": -393.116943359375, - "loss": 0.0163, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.500928521156311, - "rewards/margins": 11.802443504333496, - "rewards/rejected": -11.301515579223633, + "logits/chosen": -2.6646459102630615, + "logits/rejected": -2.6903421878814697, + "logps/chosen": -211.12936401367188, + "logps/rejected": -361.8469543457031, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4170397222042084, + "rewards/margins": 13.32408618927002, + "rewards/rejected": -12.9070463180542, "step": 4720 }, { "epoch": 2.44, "learning_rate": 1.0336584432969974e-07, - "logits/chosen": -2.7029783725738525, - "logits/rejected": -2.738452434539795, - "logps/chosen": -209.94808959960938, - "logps/rejected": -359.03863525390625, - "loss": 0.0144, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.2978671193122864, - "rewards/margins": 9.817428588867188, - "rewards/rejected": -9.519559860229492, + "logits/chosen": -2.5830578804016113, + "logits/rejected": -2.5983011722564697, + "logps/chosen": -204.2769012451172, + "logps/rejected": -357.91290283203125, + "loss": 0.0102, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8649846911430359, + "rewards/margins": 12.186356544494629, + "rewards/rejected": -11.321372032165527, "step": 4730 }, { "epoch": 2.45, "learning_rate": 1.0240963855421686e-07, - "logits/chosen": -2.6916146278381348, - "logits/rejected": -2.7506518363952637, - "logps/chosen": -329.70587158203125, - "logps/rejected": -386.9439697265625, - "loss": 0.0069, + "logits/chosen": -2.5975985527038574, + "logits/rejected": -2.5991950035095215, + "logps/chosen": -317.145751953125, + "logps/rejected": -341.62738037109375, + "loss": 0.0149, "rewards/accuracies": 1.0, - "rewards/chosen": 0.8479889035224915, - "rewards/margins": 10.828804016113281, - "rewards/rejected": -9.980814933776855, + "rewards/chosen": 2.106783390045166, + "rewards/margins": 14.040043830871582, + "rewards/rejected": -11.933259010314941, "step": 4740 }, { "epoch": 2.45, "learning_rate": 1.0145343277873399e-07, - "logits/chosen": -2.7057933807373047, - "logits/rejected": -2.7978127002716064, - "logps/chosen": -277.19329833984375, - "logps/rejected": -393.6683044433594, - "loss": 0.0159, + "logits/chosen": -2.6228525638580322, + "logits/rejected": -2.6971497535705566, + "logps/chosen": -273.76397705078125, + "logps/rejected": -351.47015380859375, + "loss": 0.009, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.7689875364303589, - "rewards/margins": 11.254928588867188, - "rewards/rejected": -10.485939979553223, + "rewards/chosen": 1.111918568611145, + "rewards/margins": 13.509257316589355, + "rewards/rejected": -12.3973388671875, "step": 4750 }, { "epoch": 2.46, "learning_rate": 1.004972270032511e-07, - "logits/chosen": -2.6612842082977295, - "logits/rejected": -2.6679983139038086, - "logps/chosen": -275.8064270019531, - "logps/rejected": -327.3836364746094, - "loss": 0.0302, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": -0.3787831664085388, - "rewards/margins": 10.829161643981934, - "rewards/rejected": -11.207944869995117, + "logits/chosen": -2.5505127906799316, + "logits/rejected": -2.493907928466797, + "logps/chosen": -266.0143737792969, + "logps/rejected": -310.218017578125, + "loss": 0.0079, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.6014013886451721, + "rewards/margins": 12.798852920532227, + "rewards/rejected": -12.197453498840332, "step": 4760 }, { "epoch": 2.46, "learning_rate": 9.95410212277682e-08, - "logits/chosen": -2.767195463180542, - "logits/rejected": -2.8072774410247803, - "logps/chosen": -275.5713806152344, - "logps/rejected": -398.08782958984375, - "loss": 0.0115, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.25600147247314453, - "rewards/margins": 10.520998001098633, - "rewards/rejected": -10.77700138092041, + "logits/chosen": -2.6540799140930176, + "logits/rejected": -2.6570849418640137, + "logps/chosen": -278.6947326660156, + "logps/rejected": -355.62322998046875, + "loss": 0.0083, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.042123548686504364, + "rewards/margins": 12.820783615112305, + "rewards/rejected": -12.778658866882324, "step": 4770 }, { "epoch": 2.47, "learning_rate": 9.858481545228532e-08, - "logits/chosen": -2.7905125617980957, - "logits/rejected": -2.7901828289031982, - "logps/chosen": -246.63400268554688, - "logps/rejected": -346.6756896972656, - "loss": 0.0129, + "logits/chosen": -2.7023508548736572, + "logits/rejected": -2.6539154052734375, + "logps/chosen": -245.067138671875, + "logps/rejected": -322.9728698730469, + "loss": 0.0107, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.0959916040301323, - "rewards/margins": 10.186857223510742, - "rewards/rejected": -10.28284740447998, + "rewards/chosen": 0.06069324538111687, + "rewards/margins": 13.387222290039062, + "rewards/rejected": -13.326528549194336, "step": 4780 }, { "epoch": 2.47, "learning_rate": 9.762860967680245e-08, - "logits/chosen": -2.765639543533325, - "logits/rejected": -2.750558614730835, - "logps/chosen": -260.526123046875, - "logps/rejected": -423.7069396972656, - "loss": 0.0094, + "logits/chosen": -2.653693914413452, + "logits/rejected": -2.6088497638702393, + "logps/chosen": -251.1782684326172, + "logps/rejected": -379.19708251953125, + "loss": 0.0059, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.6589454412460327, - "rewards/margins": 11.203248977661133, - "rewards/rejected": -11.86219596862793, + "rewards/chosen": 0.2645092010498047, + "rewards/margins": 13.2977294921875, + "rewards/rejected": -13.033220291137695, "step": 4790 }, { "epoch": 2.48, "learning_rate": 9.667240390131957e-08, - "logits/chosen": -2.8430063724517822, - "logits/rejected": -2.862946033477783, - "logps/chosen": -288.14813232421875, - "logps/rejected": -384.7672119140625, - "loss": 0.0636, + "logits/chosen": -2.7612671852111816, + "logits/rejected": -2.720489263534546, + "logps/chosen": -285.9808654785156, + "logps/rejected": -322.31939697265625, + "loss": 0.0345, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.6992109417915344, - "rewards/margins": 11.617467880249023, - "rewards/rejected": -10.918257713317871, + "rewards/chosen": 0.9154788851737976, + "rewards/margins": 14.567514419555664, + "rewards/rejected": -13.6520357131958, "step": 4800 }, { "epoch": 2.48, - "eval_logits/chosen": -2.779690742492676, - "eval_logits/rejected": -2.8455140590667725, - "eval_logps/chosen": -266.16302490234375, - "eval_logps/rejected": -353.9501953125, - "eval_loss": 0.5866954326629639, - "eval_rewards/accuracies": 0.8140000104904175, - "eval_rewards/chosen": -1.848286747932434, - "eval_rewards/margins": 5.932989120483398, - "eval_rewards/rejected": -7.781275272369385, - "eval_runtime": 278.9143, - "eval_samples_per_second": 7.171, - "eval_steps_per_second": 0.448, + "eval_logits/chosen": -2.7051503658294678, + "eval_logits/rejected": -2.731257677078247, + "eval_logps/chosen": -258.7745056152344, + "eval_logps/rejected": -326.2886047363281, + "eval_loss": 0.42061492800712585, + "eval_rewards/accuracies": 0.8679999709129333, + "eval_rewards/chosen": -1.0960007905960083, + "eval_rewards/margins": 8.959474563598633, + "eval_rewards/rejected": -10.055475234985352, + "eval_runtime": 276.6764, + "eval_samples_per_second": 7.229, + "eval_steps_per_second": 0.452, "step": 4800 }, { "epoch": 2.48, "learning_rate": 9.571619812583667e-08, - "logits/chosen": -2.7728781700134277, - "logits/rejected": -2.819183349609375, - "logps/chosen": -268.3041076660156, - "logps/rejected": -418.9088439941406, - "loss": 0.0139, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.47220858931541443, - "rewards/margins": 12.578226089477539, - "rewards/rejected": -12.10601806640625, + "logits/chosen": -2.665557861328125, + "logits/rejected": -2.692908525466919, + "logps/chosen": -265.09759521484375, + "logps/rejected": -375.97637939453125, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7927976846694946, + "rewards/margins": 13.853170394897461, + "rewards/rejected": -13.06037425994873, "step": 4810 }, { "epoch": 2.49, "learning_rate": 9.47599923503538e-08, - "logits/chosen": -2.7985777854919434, - "logits/rejected": -2.8246827125549316, - "logps/chosen": -248.9866485595703, - "logps/rejected": -388.82525634765625, - "loss": 0.0209, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.21932640671730042, - "rewards/margins": 10.778423309326172, - "rewards/rejected": -10.55909538269043, + "logits/chosen": -2.708007574081421, + "logits/rejected": -2.7174456119537354, + "logps/chosen": -248.9668426513672, + "logps/rejected": -361.8161926269531, + "loss": 0.014, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22111478447914124, + "rewards/margins": 13.243840217590332, + "rewards/rejected": -13.022726058959961, "step": 4820 }, { "epoch": 2.49, "learning_rate": 9.380378657487091e-08, - "logits/chosen": -2.8599352836608887, - "logits/rejected": -2.8889236450195312, - "logps/chosen": -281.4698486328125, - "logps/rejected": -385.79595947265625, - "loss": 0.0155, + "logits/chosen": -2.7467713356018066, + "logits/rejected": -2.7494068145751953, + "logps/chosen": -286.0825500488281, + "logps/rejected": -329.0849609375, + "loss": 0.008, "rewards/accuracies": 1.0, - "rewards/chosen": 0.5898095965385437, - "rewards/margins": 12.167850494384766, - "rewards/rejected": -11.578041076660156, + "rewards/chosen": 0.1292753517627716, + "rewards/margins": 14.239352226257324, + "rewards/rejected": -14.110076904296875, "step": 4830 }, { "epoch": 2.5, "learning_rate": 9.284758079938803e-08, - "logits/chosen": -2.807657241821289, - "logits/rejected": -2.8112683296203613, - "logps/chosen": -304.69256591796875, - "logps/rejected": -374.6174621582031, - "loss": 0.0071, + "logits/chosen": -2.6901822090148926, + "logits/rejected": -2.6681206226348877, + "logps/chosen": -305.8731384277344, + "logps/rejected": -339.563720703125, + "loss": 0.0056, "rewards/accuracies": 1.0, - "rewards/chosen": 0.626765787601471, - "rewards/margins": 11.146921157836914, - "rewards/rejected": -10.520155906677246, + "rewards/chosen": 0.5087541937828064, + "rewards/margins": 13.043899536132812, + "rewards/rejected": -12.53514289855957, "step": 4840 }, { "epoch": 2.5, "learning_rate": 9.189137502390513e-08, - "logits/chosen": -2.855536699295044, - "logits/rejected": -2.8657050132751465, - "logps/chosen": -270.238037109375, - "logps/rejected": -440.15814208984375, - "loss": 0.0059, + "logits/chosen": -2.7424557209014893, + "logits/rejected": -2.735781192779541, + "logps/chosen": -267.27056884765625, + "logps/rejected": -405.3484802246094, + "loss": 0.0047, "rewards/accuracies": 1.0, - "rewards/chosen": -0.2958551049232483, - "rewards/margins": 10.37101936340332, - "rewards/rejected": -10.66687297821045, + "rewards/chosen": 0.0006778210517950356, + "rewards/margins": 12.662663459777832, + "rewards/rejected": -12.661985397338867, "step": 4850 }, { "epoch": 2.51, "learning_rate": 9.093516924842226e-08, - "logits/chosen": -2.802277088165283, - "logits/rejected": -2.852307081222534, - "logps/chosen": -238.21304321289062, - "logps/rejected": -415.5626525878906, - "loss": 0.0102, + "logits/chosen": -2.6823434829711914, + "logits/rejected": -2.69766902923584, + "logps/chosen": -230.4172821044922, + "logps/rejected": -358.4924011230469, + "loss": 0.0082, "rewards/accuracies": 1.0, - "rewards/chosen": 0.5378857254981995, - "rewards/margins": 11.27935791015625, - "rewards/rejected": -10.741472244262695, + "rewards/chosen": 1.318101167678833, + "rewards/margins": 13.956260681152344, + "rewards/rejected": -12.638158798217773, "step": 4860 }, { "epoch": 2.51, "learning_rate": 8.997896347293938e-08, - "logits/chosen": -2.8001160621643066, - "logits/rejected": -2.879955768585205, - "logps/chosen": -195.65481567382812, - "logps/rejected": -427.76470947265625, - "loss": 0.0141, - "rewards/accuracies": 1.0, - "rewards/chosen": 1.1122699975967407, - "rewards/margins": 11.812494277954102, - "rewards/rejected": -10.700222969055176, + "logits/chosen": -2.678046464920044, + "logits/rejected": -2.7416224479675293, + "logps/chosen": -191.58053588867188, + "logps/rejected": -402.94818115234375, + "loss": 0.0104, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5196973085403442, + "rewards/margins": 14.120944023132324, + "rewards/rejected": -12.601248741149902, "step": 4870 }, { "epoch": 2.52, "learning_rate": 8.902275769745648e-08, - "logits/chosen": -2.8066625595092773, - "logits/rejected": -2.8681557178497314, - "logps/chosen": -243.08468627929688, - "logps/rejected": -397.8512268066406, - "loss": 0.0119, + "logits/chosen": -2.6824116706848145, + "logits/rejected": -2.698320150375366, + "logps/chosen": -248.81399536132812, + "logps/rejected": -346.9815979003906, + "loss": 0.0086, "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": -0.1370861530303955, - "rewards/margins": 11.573083877563477, - "rewards/rejected": -11.710169792175293, + "rewards/chosen": -0.2214881181716919, + "rewards/margins": 13.749554634094238, + "rewards/rejected": -13.971043586730957, "step": 4880 }, { "epoch": 2.52, "learning_rate": 8.806655192197361e-08, - "logits/chosen": -2.777631998062134, - "logits/rejected": -2.854902744293213, - "logps/chosen": -234.71383666992188, - "logps/rejected": -398.0288391113281, - "loss": 0.0107, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.32972007989883423, - "rewards/margins": 11.04564094543457, - "rewards/rejected": -11.375360488891602, + "logits/chosen": -2.6688122749328613, + "logits/rejected": -2.691349506378174, + "logps/chosen": -227.3494415283203, + "logps/rejected": -379.195556640625, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4076913893222809, + "rewards/margins": 13.870794296264648, + "rewards/rejected": -13.463102340698242, "step": 4890 }, { "epoch": 2.53, "learning_rate": 8.711034614649072e-08, - "logits/chosen": -2.842499256134033, - "logits/rejected": -2.846536874771118, - "logps/chosen": -236.6642303466797, - "logps/rejected": -412.7945861816406, - "loss": 0.0125, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.5801752805709839, - "rewards/margins": 12.802286148071289, - "rewards/rejected": -12.2221097946167, + "logits/chosen": -2.728708505630493, + "logits/rejected": -2.6870615482330322, + "logps/chosen": -237.6988983154297, + "logps/rejected": -381.16015625, + "loss": 0.013, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.47214269638061523, + "rewards/margins": 13.449788093566895, + "rewards/rejected": -12.977645874023438, "step": 4900 }, { "epoch": 2.53, - "eval_logits/chosen": -2.768664598464966, - "eval_logits/rejected": -2.8342463970184326, - "eval_logps/chosen": -266.76190185546875, - "eval_logps/rejected": -354.1345520019531, - "eval_loss": 0.5878357291221619, - "eval_rewards/accuracies": 0.8140000104904175, - "eval_rewards/chosen": -1.9081742763519287, - "eval_rewards/margins": 5.891535758972168, - "eval_rewards/rejected": -7.799710273742676, - "eval_runtime": 278.4252, - "eval_samples_per_second": 7.183, - "eval_steps_per_second": 0.449, + "eval_logits/chosen": -2.682142496109009, + "eval_logits/rejected": -2.704385757446289, + "eval_logps/chosen": -259.3948974609375, + "eval_logps/rejected": -327.4449768066406, + "eval_loss": 0.42360028624534607, + "eval_rewards/accuracies": 0.8679999709129333, + "eval_rewards/chosen": -1.1580402851104736, + "eval_rewards/margins": 9.013073921203613, + "eval_rewards/rejected": -10.171113967895508, + "eval_runtime": 276.2516, + "eval_samples_per_second": 7.24, + "eval_steps_per_second": 0.452, "step": 4900 }, { "epoch": 2.53, "learning_rate": 8.615414037100784e-08, - "logits/chosen": -2.744393825531006, - "logits/rejected": -2.792158603668213, - "logps/chosen": -311.90576171875, - "logps/rejected": -381.0210266113281, - "loss": 0.0069, + "logits/chosen": -2.6353044509887695, + "logits/rejected": -2.6509835720062256, + "logps/chosen": -311.1354675292969, + "logps/rejected": -330.7841491699219, + "loss": 0.0082, "rewards/accuracies": 1.0, - "rewards/chosen": 0.9963443875312805, - "rewards/margins": 11.411710739135742, - "rewards/rejected": -10.415367126464844, + "rewards/chosen": 1.0729786157608032, + "rewards/margins": 14.09166431427002, + "rewards/rejected": -13.01868724822998, "step": 4910 }, { "epoch": 2.54, "learning_rate": 8.519793459552494e-08, - "logits/chosen": -2.7664597034454346, - "logits/rejected": -2.8189749717712402, - "logps/chosen": -216.3651123046875, - "logps/rejected": -382.20184326171875, - "loss": 0.0215, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.7522561550140381, - "rewards/margins": 11.970437049865723, - "rewards/rejected": -11.218182563781738, + "logits/chosen": -2.662177324295044, + "logits/rejected": -2.6865832805633545, + "logps/chosen": -219.2774658203125, + "logps/rejected": -347.94830322265625, + "loss": 0.0143, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7228763699531555, + "rewards/margins": 13.378216743469238, + "rewards/rejected": -12.655339241027832, "step": 4920 }, { "epoch": 2.55, "learning_rate": 8.424172882004207e-08, - "logits/chosen": -2.766000270843506, - "logits/rejected": -2.815253257751465, - "logps/chosen": -247.57177734375, - "logps/rejected": -414.02117919921875, - "loss": 0.0223, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.35223907232284546, - "rewards/margins": 9.96684455871582, - "rewards/rejected": -9.614606857299805, + "logits/chosen": -2.6949350833892822, + "logits/rejected": -2.715196132659912, + "logps/chosen": -246.2075653076172, + "logps/rejected": -375.4911193847656, + "loss": 0.046, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.48857760429382324, + "rewards/margins": 12.561260223388672, + "rewards/rejected": -12.072683334350586, "step": 4930 }, { "epoch": 2.55, "learning_rate": 8.328552304455919e-08, - "logits/chosen": -2.8126516342163086, - "logits/rejected": -2.8359436988830566, - "logps/chosen": -246.8765106201172, - "logps/rejected": -343.9100036621094, - "loss": 0.0139, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.468789666891098, - "rewards/margins": 10.000219345092773, - "rewards/rejected": -9.5314302444458, + "logits/chosen": -2.7643871307373047, + "logits/rejected": -2.756699562072754, + "logps/chosen": -242.24911499023438, + "logps/rejected": -320.3516845703125, + "loss": 0.0185, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.9298922419548035, + "rewards/margins": 12.572754859924316, + "rewards/rejected": -11.642863273620605, "step": 4940 }, { "epoch": 2.56, "learning_rate": 8.23293172690763e-08, - "logits/chosen": -2.7989373207092285, - "logits/rejected": -2.830984115600586, - "logps/chosen": -271.40362548828125, - "logps/rejected": -391.43890380859375, - "loss": 0.0066, + "logits/chosen": -2.7400996685028076, + "logits/rejected": -2.718656063079834, + "logps/chosen": -261.52459716796875, + "logps/rejected": -340.1265869140625, + "loss": 0.0033, "rewards/accuracies": 1.0, - "rewards/chosen": 0.7524517178535461, - "rewards/margins": 11.619732856750488, - "rewards/rejected": -10.867280960083008, + "rewards/chosen": 1.7511522769927979, + "rewards/margins": 13.349233627319336, + "rewards/rejected": -11.598082542419434, "step": 4950 }, { "epoch": 2.56, "learning_rate": 8.137311149359343e-08, - "logits/chosen": -2.868978977203369, - "logits/rejected": -2.8965516090393066, - "logps/chosen": -297.94769287109375, - "logps/rejected": -374.5810241699219, - "loss": 0.0053, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.8119597434997559, - "rewards/margins": 11.822549819946289, - "rewards/rejected": -11.010589599609375, + "logits/chosen": -2.8236403465270996, + "logits/rejected": -2.8367934226989746, + "logps/chosen": -289.75250244140625, + "logps/rejected": -373.33636474609375, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6314811706542969, + "rewards/margins": 13.898635864257812, + "rewards/rejected": -12.267154693603516, "step": 4960 }, { "epoch": 2.57, "learning_rate": 8.041690571811053e-08, - "logits/chosen": -2.8357903957366943, - "logits/rejected": -2.8867337703704834, - "logps/chosen": -277.58050537109375, - "logps/rejected": -391.5204772949219, - "loss": 0.0156, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": -0.00696375360712409, - "rewards/margins": 11.595417022705078, - "rewards/rejected": -11.602380752563477, + "logits/chosen": -2.7766637802124023, + "logits/rejected": -2.8046655654907227, + "logps/chosen": -275.01739501953125, + "logps/rejected": -352.0995178222656, + "loss": 0.0082, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.6717353463172913, + "rewards/margins": 13.821104049682617, + "rewards/rejected": -13.149370193481445, "step": 4970 }, { "epoch": 2.57, "learning_rate": 7.946069994262765e-08, - "logits/chosen": -2.7701573371887207, - "logits/rejected": -2.7922308444976807, - "logps/chosen": -273.548583984375, - "logps/rejected": -363.3747863769531, - "loss": 0.0106, + "logits/chosen": -2.707808256149292, + "logits/rejected": -2.6958508491516113, + "logps/chosen": -268.88275146484375, + "logps/rejected": -331.9034118652344, + "loss": 0.0075, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.018690502271056175, - "rewards/margins": 9.98866081237793, - "rewards/rejected": -9.969969749450684, + "rewards/chosen": 0.4865795075893402, + "rewards/margins": 11.622472763061523, + "rewards/rejected": -11.135892868041992, "step": 4980 }, { "epoch": 2.58, "learning_rate": 7.850449416714476e-08, - "logits/chosen": -2.7947163581848145, - "logits/rejected": -2.8349592685699463, - "logps/chosen": -271.5372009277344, - "logps/rejected": -382.420654296875, - "loss": 0.0075, + "logits/chosen": -2.7567055225372314, + "logits/rejected": -2.7784535884857178, + "logps/chosen": -265.5184020996094, + "logps/rejected": -355.8439025878906, + "loss": 0.0057, "rewards/accuracies": 1.0, - "rewards/chosen": 0.37151023745536804, - "rewards/margins": 10.325210571289062, - "rewards/rejected": -9.953700065612793, + "rewards/chosen": 0.9733927845954895, + "rewards/margins": 12.348105430603027, + "rewards/rejected": -11.374711990356445, "step": 4990 }, { "epoch": 2.58, "learning_rate": 7.754828839166188e-08, - "logits/chosen": -2.783332586288452, - "logits/rejected": -2.8476181030273438, - "logps/chosen": -250.2198486328125, - "logps/rejected": -405.2004699707031, - "loss": 0.0105, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.36954087018966675, - "rewards/margins": 11.464941024780273, - "rewards/rejected": -11.834482192993164, + "logits/chosen": -2.722430944442749, + "logits/rejected": -2.7331531047821045, + "logps/chosen": -249.72158813476562, + "logps/rejected": -347.60052490234375, + "loss": 0.0097, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.32078036665916443, + "rewards/margins": 12.321962356567383, + "rewards/rejected": -12.001180648803711, "step": 5000 }, { "epoch": 2.58, - "eval_logits/chosen": -2.7497642040252686, - "eval_logits/rejected": -2.814385175704956, - "eval_logps/chosen": -269.3044738769531, - "eval_logps/rejected": -358.25360107421875, - "eval_loss": 0.5969280004501343, - "eval_rewards/accuracies": 0.8119999766349792, - "eval_rewards/chosen": -2.162431240081787, - "eval_rewards/margins": 6.049188137054443, - "eval_rewards/rejected": -8.211620330810547, - "eval_runtime": 278.5346, - "eval_samples_per_second": 7.18, - "eval_steps_per_second": 0.449, + "eval_logits/chosen": -2.727210521697998, + "eval_logits/rejected": -2.7532944679260254, + "eval_logps/chosen": -258.7012634277344, + "eval_logps/rejected": -325.6721496582031, + "eval_loss": 0.40755683183670044, + "eval_rewards/accuracies": 0.8700000047683716, + "eval_rewards/chosen": -1.0886754989624023, + "eval_rewards/margins": 8.905159950256348, + "eval_rewards/rejected": -9.993836402893066, + "eval_runtime": 276.1188, + "eval_samples_per_second": 7.243, + "eval_steps_per_second": 0.453, "step": 5000 }, { "epoch": 2.59, "learning_rate": 7.6592082616179e-08, - "logits/chosen": -2.7811551094055176, - "logits/rejected": -2.8356614112854004, - "logps/chosen": -245.42977905273438, - "logps/rejected": -425.71038818359375, - "loss": 0.009, + "logits/chosen": -2.739668607711792, + "logits/rejected": -2.7455170154571533, + "logps/chosen": -238.7808837890625, + "logps/rejected": -400.61041259765625, + "loss": 0.0078, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.30897045135498047, - "rewards/margins": 11.47396183013916, - "rewards/rejected": -11.78293228149414, + "rewards/chosen": 0.35591837763786316, + "rewards/margins": 12.577946662902832, + "rewards/rejected": -12.222027778625488, "step": 5010 }, { "epoch": 2.59, "learning_rate": 7.563587684069611e-08, - "logits/chosen": -2.7130038738250732, - "logits/rejected": -2.8152432441711426, - "logps/chosen": -284.6722717285156, - "logps/rejected": -402.2973937988281, - "loss": 0.0067, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.4836142063140869, - "rewards/margins": 11.877161979675293, - "rewards/rejected": -11.393548965454102, + "logits/chosen": -2.6678476333618164, + "logits/rejected": -2.7188563346862793, + "logps/chosen": -282.88995361328125, + "logps/rejected": -365.89202880859375, + "loss": 0.0172, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.6602963805198669, + "rewards/margins": 13.748291015625, + "rewards/rejected": -13.087995529174805, "step": 5020 }, { "epoch": 2.6, "learning_rate": 7.467967106521324e-08, - "logits/chosen": -2.779438018798828, - "logits/rejected": -2.834465265274048, - "logps/chosen": -247.9782257080078, - "logps/rejected": -290.38848876953125, - "loss": 0.0119, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.46960878372192383, - "rewards/margins": 9.75239372253418, - "rewards/rejected": -10.222002983093262, + "logits/chosen": -2.7129931449890137, + "logits/rejected": -2.7390971183776855, + "logps/chosen": -242.2963409423828, + "logps/rejected": -271.34002685546875, + "loss": 0.0155, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.098580002784729, + "rewards/margins": 11.570712089538574, + "rewards/rejected": -11.472132682800293, "step": 5030 }, { "epoch": 2.6, "learning_rate": 7.372346528973034e-08, - "logits/chosen": -2.828298807144165, - "logits/rejected": -2.8650214672088623, - "logps/chosen": -255.68765258789062, - "logps/rejected": -359.00189208984375, - "loss": 0.0169, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.4944925308227539, - "rewards/margins": 11.243809700012207, - "rewards/rejected": -10.749317169189453, + "logits/chosen": -2.755577802658081, + "logits/rejected": -2.7721900939941406, + "logps/chosen": -252.08432006835938, + "logps/rejected": -324.054931640625, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8548260927200317, + "rewards/margins": 12.422867774963379, + "rewards/rejected": -11.56804370880127, "step": 5040 }, { "epoch": 2.61, "learning_rate": 7.276725951424746e-08, - "logits/chosen": -2.7610697746276855, - "logits/rejected": -2.795448064804077, - "logps/chosen": -215.3785400390625, - "logps/rejected": -383.4140930175781, - "loss": 0.0185, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.4611515998840332, - "rewards/margins": 10.838452339172363, - "rewards/rejected": -11.299602508544922, + "logits/chosen": -2.7009801864624023, + "logits/rejected": -2.703165054321289, + "logps/chosen": -214.4167022705078, + "logps/rejected": -319.8914794921875, + "loss": 0.007, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07689909636974335, + "rewards/margins": 12.72560977935791, + "rewards/rejected": -12.648710250854492, "step": 5050 }, { "epoch": 2.61, "learning_rate": 7.181105373876457e-08, - "logits/chosen": -2.779067277908325, - "logits/rejected": -2.810563802719116, - "logps/chosen": -245.2666778564453, - "logps/rejected": -347.674072265625, - "loss": 0.0127, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.05352171137928963, - "rewards/margins": 10.571831703186035, - "rewards/rejected": -10.518308639526367, + "logits/chosen": -2.703011989593506, + "logits/rejected": -2.7122387886047363, + "logps/chosen": -234.91598510742188, + "logps/rejected": -314.00115966796875, + "loss": 0.0776, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0885902643203735, + "rewards/margins": 12.535428047180176, + "rewards/rejected": -11.446836471557617, "step": 5060 }, { "epoch": 2.62, "learning_rate": 7.08548479632817e-08, - "logits/chosen": -2.7645630836486816, - "logits/rejected": -2.807478427886963, - "logps/chosen": -301.7003173828125, - "logps/rejected": -394.48809814453125, - "loss": 0.0087, + "logits/chosen": -2.6733107566833496, + "logits/rejected": -2.6843533515930176, + "logps/chosen": -301.4144287109375, + "logps/rejected": -345.72998046875, + "loss": 0.0063, "rewards/accuracies": 1.0, - "rewards/chosen": 0.8143863677978516, - "rewards/margins": 12.549185752868652, - "rewards/rejected": -11.734800338745117, + "rewards/chosen": 0.8402122259140015, + "rewards/margins": 13.855137825012207, + "rewards/rejected": -13.014925003051758, "step": 5070 }, { "epoch": 2.62, "learning_rate": 6.98986421877988e-08, - "logits/chosen": -2.80440616607666, - "logits/rejected": -2.832104206085205, - "logps/chosen": -273.5317077636719, - "logps/rejected": -410.95965576171875, - "loss": 0.0142, + "logits/chosen": -2.741243362426758, + "logits/rejected": -2.733663320541382, + "logps/chosen": -271.98486328125, + "logps/rejected": -349.08197021484375, + "loss": 0.0201, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.07825110107660294, - "rewards/margins": 10.619649887084961, - "rewards/rejected": -10.541399955749512, + "rewards/chosen": 0.6460112929344177, + "rewards/margins": 12.609220504760742, + "rewards/rejected": -11.96320915222168, "step": 5080 }, { "epoch": 2.63, "learning_rate": 6.894243641231592e-08, - "logits/chosen": -2.7227680683135986, - "logits/rejected": -2.7704923152923584, - "logps/chosen": -265.0140075683594, - "logps/rejected": -402.3088073730469, - "loss": 0.0104, + "logits/chosen": -2.6444013118743896, + "logits/rejected": -2.647550106048584, + "logps/chosen": -261.9896545410156, + "logps/rejected": -355.0357971191406, + "loss": 0.0054, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.47135013341903687, - "rewards/margins": 12.040913581848145, - "rewards/rejected": -11.569562911987305, + "rewards/chosen": 0.7737864255905151, + "rewards/margins": 13.506382942199707, + "rewards/rejected": -12.732596397399902, "step": 5090 }, { "epoch": 2.63, "learning_rate": 6.798623063683305e-08, - "logits/chosen": -2.719788074493408, - "logits/rejected": -2.770781993865967, - "logps/chosen": -280.3219299316406, - "logps/rejected": -383.2566833496094, - "loss": 0.0207, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.07484505325555801, - "rewards/margins": 12.587934494018555, - "rewards/rejected": -12.6627779006958, + "logits/chosen": -2.643336534500122, + "logits/rejected": -2.6624462604522705, + "logps/chosen": -275.2840270996094, + "logps/rejected": -340.90020751953125, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4289468824863434, + "rewards/margins": 13.692105293273926, + "rewards/rejected": -13.263158798217773, "step": 5100 }, { "epoch": 2.63, - "eval_logits/chosen": -2.755725145339966, - "eval_logits/rejected": -2.8196589946746826, - "eval_logps/chosen": -269.3545837402344, - "eval_logps/rejected": -358.35565185546875, - "eval_loss": 0.6007997989654541, - "eval_rewards/accuracies": 0.8119999766349792, - "eval_rewards/chosen": -2.16744327545166, - "eval_rewards/margins": 6.054382801055908, - "eval_rewards/rejected": -8.221826553344727, - "eval_runtime": 278.6404, - "eval_samples_per_second": 7.178, - "eval_steps_per_second": 0.449, + "eval_logits/chosen": -2.703171491622925, + "eval_logits/rejected": -2.727707624435425, + "eval_logps/chosen": -258.44683837890625, + "eval_logps/rejected": -326.0480651855469, + "eval_loss": 0.4083701968193054, + "eval_rewards/accuracies": 0.8700000047683716, + "eval_rewards/chosen": -1.063233494758606, + "eval_rewards/margins": 8.968192100524902, + "eval_rewards/rejected": -10.031425476074219, + "eval_runtime": 276.4067, + "eval_samples_per_second": 7.236, + "eval_steps_per_second": 0.452, "step": 5100 }, { "epoch": 2.64, "learning_rate": 6.703002486135017e-08, - "logits/chosen": -2.7945752143859863, - "logits/rejected": -2.8086471557617188, - "logps/chosen": -247.31655883789062, - "logps/rejected": -398.75177001953125, - "loss": 0.0186, + "logits/chosen": -2.694871187210083, + "logits/rejected": -2.6700923442840576, + "logps/chosen": -250.1337890625, + "logps/rejected": -336.16094970703125, + "loss": 0.0137, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.4659408628940582, - "rewards/margins": 11.80299186706543, - "rewards/rejected": -12.268933296203613, + "rewards/chosen": -0.7494642734527588, + "rewards/margins": 12.755358695983887, + "rewards/rejected": -13.5048246383667, "step": 5110 }, { "epoch": 2.64, "learning_rate": 6.607381908586727e-08, - "logits/chosen": -2.809502601623535, - "logits/rejected": -2.7968926429748535, - "logps/chosen": -258.90118408203125, - "logps/rejected": -439.64849853515625, - "loss": 0.0146, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.32113468647003174, - "rewards/margins": 11.847344398498535, - "rewards/rejected": -11.526209831237793, + "logits/chosen": -2.7246272563934326, + "logits/rejected": -2.670262336730957, + "logps/chosen": -263.38006591796875, + "logps/rejected": -375.41119384765625, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.031157981604337692, + "rewards/margins": 13.083641052246094, + "rewards/rejected": -13.114797592163086, "step": 5120 }, { "epoch": 2.65, "learning_rate": 6.511761331038438e-08, - "logits/chosen": -2.8286375999450684, - "logits/rejected": -2.859506130218506, - "logps/chosen": -230.7789764404297, - "logps/rejected": -376.81695556640625, - "loss": 0.0089, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.6774338483810425, - "rewards/margins": 10.802311897277832, - "rewards/rejected": -11.479743957519531, + "logits/chosen": -2.7536988258361816, + "logits/rejected": -2.754511594772339, + "logps/chosen": -224.62551879882812, + "logps/rejected": -321.27471923828125, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06338664144277573, + "rewards/margins": 12.562700271606445, + "rewards/rejected": -12.62608814239502, "step": 5130 }, { "epoch": 2.65, "learning_rate": 6.416140753490151e-08, - "logits/chosen": -2.7911789417266846, - "logits/rejected": -2.8675835132598877, - "logps/chosen": -271.16534423828125, - "logps/rejected": -420.4945373535156, - "loss": 0.0108, + "logits/chosen": -2.7295079231262207, + "logits/rejected": -2.760415554046631, + "logps/chosen": -268.64569091796875, + "logps/rejected": -372.0501403808594, + "loss": 0.0072, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.7790706753730774, - "rewards/margins": 11.275418281555176, - "rewards/rejected": -10.49634838104248, + "rewards/chosen": 1.0383787155151367, + "rewards/margins": 12.919191360473633, + "rewards/rejected": -11.88081169128418, "step": 5140 }, { "epoch": 2.66, "learning_rate": 6.320520175941863e-08, - "logits/chosen": -2.7352991104125977, - "logits/rejected": -2.7836477756500244, - "logps/chosen": -245.20999145507812, - "logps/rejected": -382.8066101074219, - "loss": 0.0084, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.1987796127796173, - "rewards/margins": 11.559615135192871, - "rewards/rejected": -11.758394241333008, + "logits/chosen": -2.639681577682495, + "logits/rejected": -2.6639719009399414, + "logps/chosen": -238.30734252929688, + "logps/rejected": -338.4950866699219, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4911627769470215, + "rewards/margins": 13.771158218383789, + "rewards/rejected": -13.279995918273926, "step": 5150 }, { "epoch": 2.66, "learning_rate": 6.224899598393573e-08, - "logits/chosen": -2.826261281967163, - "logits/rejected": -2.8705601692199707, - "logps/chosen": -322.09613037109375, - "logps/rejected": -440.3390197753906, - "loss": 0.0125, + "logits/chosen": -2.7412397861480713, + "logits/rejected": -2.7424371242523193, + "logps/chosen": -315.08062744140625, + "logps/rejected": -383.92327880859375, + "loss": 0.0096, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.9744750261306763, - "rewards/margins": 12.657623291015625, - "rewards/rejected": -11.683148384094238, + "rewards/chosen": 1.6760326623916626, + "rewards/margins": 14.238224983215332, + "rewards/rejected": -12.5621919631958, "step": 5160 }, { "epoch": 2.67, "learning_rate": 6.129279020845286e-08, - "logits/chosen": -2.837642192840576, - "logits/rejected": -2.821343421936035, - "logps/chosen": -219.88204956054688, - "logps/rejected": -346.25341796875, - "loss": 0.0125, + "logits/chosen": -2.7472915649414062, + "logits/rejected": -2.7116830348968506, + "logps/chosen": -213.22177124023438, + "logps/rejected": -325.4984130859375, + "loss": 0.0086, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.9648548364639282, - "rewards/margins": 12.348124504089355, - "rewards/rejected": -11.383270263671875, + "rewards/chosen": 1.628686547279358, + "rewards/margins": 13.93322467803955, + "rewards/rejected": -12.304536819458008, "step": 5170 }, { "epoch": 2.67, "learning_rate": 6.033658443296998e-08, - "logits/chosen": -2.808182716369629, - "logits/rejected": -2.8574142456054688, - "logps/chosen": -260.39801025390625, - "logps/rejected": -390.830322265625, - "loss": 0.0095, + "logits/chosen": -2.7346999645233154, + "logits/rejected": -2.7753851413726807, + "logps/chosen": -253.5308074951172, + "logps/rejected": -367.77899169921875, + "loss": 0.0121, "rewards/accuracies": 1.0, - "rewards/chosen": 0.6420486569404602, - "rewards/margins": 11.35422420501709, - "rewards/rejected": -10.712176322937012, + "rewards/chosen": 1.3358858823776245, + "rewards/margins": 14.44952392578125, + "rewards/rejected": -13.113639831542969, "step": 5180 }, { "epoch": 2.68, "learning_rate": 5.9380378657487085e-08, - "logits/chosen": -2.784289836883545, - "logits/rejected": -2.85602068901062, - "logps/chosen": -277.7592468261719, - "logps/rejected": -387.52587890625, - "loss": 0.0061, + "logits/chosen": -2.7128677368164062, + "logits/rejected": -2.723327398300171, + "logps/chosen": -275.9551086425781, + "logps/rejected": -342.8014221191406, + "loss": 0.0034, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.17926214635372162, - "rewards/margins": 11.136748313903809, - "rewards/rejected": -10.957486152648926, + "rewards/chosen": 1.0817997455596924, + "rewards/margins": 13.455174446105957, + "rewards/rejected": -12.373374938964844, "step": 5190 }, { "epoch": 2.68, "learning_rate": 5.842417288200421e-08, - "logits/chosen": -2.789968729019165, - "logits/rejected": -2.866093158721924, - "logps/chosen": -285.8483581542969, - "logps/rejected": -409.1543273925781, - "loss": 0.0103, + "logits/chosen": -2.701085090637207, + "logits/rejected": -2.7716479301452637, + "logps/chosen": -280.3232421875, + "logps/rejected": -359.11541748046875, + "loss": 0.0072, "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.21996426582336426, - "rewards/margins": 11.31649112701416, - "rewards/rejected": -11.096527099609375, + "rewards/chosen": 0.7724751234054565, + "rewards/margins": 13.861913681030273, + "rewards/rejected": -13.089439392089844, "step": 5200 }, { "epoch": 2.68, - "eval_logits/chosen": -2.7545626163482666, - "eval_logits/rejected": -2.8181042671203613, - "eval_logps/chosen": -271.59014892578125, - "eval_logps/rejected": -362.2855529785156, - "eval_loss": 0.621418833732605, - "eval_rewards/accuracies": 0.8059999942779541, - "eval_rewards/chosen": -2.391003370285034, - "eval_rewards/margins": 6.22381067276001, - "eval_rewards/rejected": -8.614813804626465, - "eval_runtime": 278.4224, - "eval_samples_per_second": 7.183, - "eval_steps_per_second": 0.449, + "eval_logits/chosen": -2.6952240467071533, + "eval_logits/rejected": -2.719881296157837, + "eval_logps/chosen": -260.15386962890625, + "eval_logps/rejected": -328.7514343261719, + "eval_loss": 0.4144607186317444, + "eval_rewards/accuracies": 0.8700000047683716, + "eval_rewards/chosen": -1.2339369058609009, + "eval_rewards/margins": 9.06782341003418, + "eval_rewards/rejected": -10.301759719848633, + "eval_runtime": 276.5621, + "eval_samples_per_second": 7.232, + "eval_steps_per_second": 0.452, "step": 5200 }, { "epoch": 2.69, "learning_rate": 5.7467967106521317e-08, - "logits/chosen": -2.7700493335723877, - "logits/rejected": -2.8540453910827637, - "logps/chosen": -192.63064575195312, - "logps/rejected": -372.6647644042969, - "loss": 0.0047, + "logits/chosen": -2.684217929840088, + "logits/rejected": -2.745728015899658, + "logps/chosen": -188.39602661132812, + "logps/rejected": -332.06402587890625, + "loss": 0.0038, "rewards/accuracies": 1.0, - "rewards/chosen": -0.06132305786013603, - "rewards/margins": 11.770467758178711, - "rewards/rejected": -11.831789016723633, + "rewards/chosen": 0.3621380031108856, + "rewards/margins": 12.54680061340332, + "rewards/rejected": -12.184662818908691, "step": 5210 }, { "epoch": 2.69, "learning_rate": 5.651176133103844e-08, - "logits/chosen": -2.817718029022217, - "logits/rejected": -2.8606326580047607, - "logps/chosen": -231.86410522460938, - "logps/rejected": -427.46014404296875, - "loss": 0.0143, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.520971417427063, - "rewards/margins": 13.073326110839844, - "rewards/rejected": -12.55235481262207, + "logits/chosen": -2.745180130004883, + "logits/rejected": -2.757676124572754, + "logps/chosen": -228.249267578125, + "logps/rejected": -391.50286865234375, + "loss": 0.0172, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.883403480052948, + "rewards/margins": 14.649864196777344, + "rewards/rejected": -13.766461372375488, "step": 5220 }, { "epoch": 2.7, "learning_rate": 5.555555555555555e-08, - "logits/chosen": -2.744419574737549, - "logits/rejected": -2.8055806159973145, - "logps/chosen": -279.8837890625, - "logps/rejected": -321.3486022949219, - "loss": 0.0248, + "logits/chosen": -2.650338888168335, + "logits/rejected": -2.6678197383880615, + "logps/chosen": -271.56805419921875, + "logps/rejected": -283.3043518066406, + "loss": 0.0109, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.1986587941646576, - "rewards/margins": 10.663652420043945, - "rewards/rejected": -10.862310409545898, + "rewards/chosen": 0.635051429271698, + "rewards/margins": 13.029541015625, + "rewards/rejected": -12.394489288330078, "step": 5230 }, { "epoch": 2.71, "learning_rate": 5.459934978007267e-08, - "logits/chosen": -2.812368631362915, - "logits/rejected": -2.8478336334228516, - "logps/chosen": -265.64459228515625, - "logps/rejected": -392.9212951660156, - "loss": 0.0046, + "logits/chosen": -2.7157227993011475, + "logits/rejected": -2.7064871788024902, + "logps/chosen": -261.5899658203125, + "logps/rejected": -350.8445129394531, + "loss": 0.0058, "rewards/accuracies": 1.0, - "rewards/chosen": 0.24865750968456268, - "rewards/margins": 12.445100784301758, - "rewards/rejected": -12.196441650390625, + "rewards/chosen": 0.6557003855705261, + "rewards/margins": 15.282397270202637, + "rewards/rejected": -14.62669849395752, "step": 5240 }, { "epoch": 2.71, "learning_rate": 5.3643144004589786e-08, - "logits/chosen": -2.753692150115967, - "logits/rejected": -2.7644002437591553, - "logps/chosen": -251.43661499023438, - "logps/rejected": -412.4212341308594, - "loss": 0.004, + "logits/chosen": -2.6575796604156494, + "logits/rejected": -2.64365816116333, + "logps/chosen": -256.532958984375, + "logps/rejected": -310.75543212890625, + "loss": 0.0022, "rewards/accuracies": 1.0, - "rewards/chosen": 1.4879121780395508, - "rewards/margins": 13.612544059753418, - "rewards/rejected": -12.124631881713867, + "rewards/chosen": 0.9336905479431152, + "rewards/margins": 13.719060897827148, + "rewards/rejected": -12.785371780395508, "step": 5250 }, { "epoch": 2.72, "learning_rate": 5.26869382291069e-08, - "logits/chosen": -2.7969448566436768, - "logits/rejected": -2.845008373260498, - "logps/chosen": -232.2137908935547, - "logps/rejected": -344.79449462890625, - "loss": 0.0113, + "logits/chosen": -2.6819491386413574, + "logits/rejected": -2.6821415424346924, + "logps/chosen": -223.9451446533203, + "logps/rejected": -325.04522705078125, + "loss": 0.0047, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.3992539942264557, - "rewards/margins": 10.323373794555664, - "rewards/rejected": -10.722628593444824, + "rewards/chosen": 0.2814604938030243, + "rewards/margins": 12.506397247314453, + "rewards/rejected": -12.224936485290527, "step": 5260 }, { "epoch": 2.72, "learning_rate": 5.173073245362402e-08, - "logits/chosen": -2.737816572189331, - "logits/rejected": -2.803926706314087, - "logps/chosen": -263.85369873046875, - "logps/rejected": -402.1507263183594, - "loss": 0.0102, + "logits/chosen": -2.632477045059204, + "logits/rejected": -2.6796512603759766, + "logps/chosen": -258.5521240234375, + "logps/rejected": -359.0589294433594, + "loss": 0.008, "rewards/accuracies": 1.0, - "rewards/chosen": -0.2205667942762375, - "rewards/margins": 11.85888671875, - "rewards/rejected": -12.079452514648438, + "rewards/chosen": 0.30959171056747437, + "rewards/margins": 13.4892578125, + "rewards/rejected": -13.179666519165039, "step": 5270 }, { "epoch": 2.73, "learning_rate": 5.077452667814113e-08, - "logits/chosen": -2.7733945846557617, - "logits/rejected": -2.7655272483825684, - "logps/chosen": -239.1111297607422, - "logps/rejected": -379.08404541015625, - "loss": 0.0142, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.969031810760498, - "rewards/margins": 11.643916130065918, - "rewards/rejected": -12.612947463989258, + "logits/chosen": -2.6577260494232178, + "logits/rejected": -2.6414647102355957, + "logps/chosen": -227.7574462890625, + "logps/rejected": -329.0008239746094, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1092122420668602, + "rewards/margins": 13.42505168914795, + "rewards/rejected": -13.315838813781738, "step": 5280 }, { "epoch": 2.73, "learning_rate": 4.981832090265825e-08, - "logits/chosen": -2.727570056915283, - "logits/rejected": -2.7912638187408447, - "logps/chosen": -214.1581573486328, - "logps/rejected": -376.3984375, - "loss": 0.0089, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.15673276782035828, - "rewards/margins": 11.739975929260254, - "rewards/rejected": -11.896708488464355, + "logits/chosen": -2.6475157737731934, + "logits/rejected": -2.6559910774230957, + "logps/chosen": -206.3763885498047, + "logps/rejected": -337.2123718261719, + "loss": 0.0094, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5923618078231812, + "rewards/margins": 13.979329109191895, + "rewards/rejected": -13.386964797973633, "step": 5290 }, { "epoch": 2.74, "learning_rate": 4.8862115127175364e-08, - "logits/chosen": -2.8189170360565186, - "logits/rejected": -2.8516533374786377, - "logps/chosen": -282.37646484375, - "logps/rejected": -383.95074462890625, - "loss": 0.0035, + "logits/chosen": -2.7450733184814453, + "logits/rejected": -2.7586669921875, + "logps/chosen": -284.9908447265625, + "logps/rejected": -332.6314392089844, + "loss": 0.0012, "rewards/accuracies": 1.0, - "rewards/chosen": 0.01846367120742798, - "rewards/margins": 11.932182312011719, - "rewards/rejected": -11.913717269897461, + "rewards/chosen": 0.8411874771118164, + "rewards/margins": 14.238296508789062, + "rewards/rejected": -13.397109985351562, "step": 5300 }, { "epoch": 2.74, - "eval_logits/chosen": -2.7435574531555176, - "eval_logits/rejected": -2.804840326309204, - "eval_logps/chosen": -270.68603515625, - "eval_logps/rejected": -360.4677429199219, - "eval_loss": 0.6089810132980347, - "eval_rewards/accuracies": 0.8119999766349792, - "eval_rewards/chosen": -2.3005878925323486, - "eval_rewards/margins": 6.132449150085449, - "eval_rewards/rejected": -8.433036804199219, - "eval_runtime": 278.4607, - "eval_samples_per_second": 7.182, - "eval_steps_per_second": 0.449, + "eval_logits/chosen": -2.68435001373291, + "eval_logits/rejected": -2.7093894481658936, + "eval_logps/chosen": -259.23260498046875, + "eval_logps/rejected": -328.7573547363281, + "eval_loss": 0.4162800908088684, + "eval_rewards/accuracies": 0.8679999709129333, + "eval_rewards/chosen": -1.141809105873108, + "eval_rewards/margins": 9.160543441772461, + "eval_rewards/rejected": -10.302351951599121, + "eval_runtime": 276.2808, + "eval_samples_per_second": 7.239, + "eval_steps_per_second": 0.452, "step": 5300 }, { "epoch": 2.74, "learning_rate": 4.790590935169248e-08, - "logits/chosen": -2.776038646697998, - "logits/rejected": -2.8061070442199707, - "logps/chosen": -217.2928924560547, - "logps/rejected": -490.9940490722656, - "loss": 0.0092, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.34224528074264526, - "rewards/margins": 11.75390911102295, - "rewards/rejected": -11.411664962768555, + "logits/chosen": -2.7030704021453857, + "logits/rejected": -2.6802847385406494, + "logps/chosen": -214.1367950439453, + "logps/rejected": -417.94024658203125, + "loss": 0.0102, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6570671796798706, + "rewards/margins": 13.37720012664795, + "rewards/rejected": -12.720132827758789, "step": 5310 }, { "epoch": 2.75, "learning_rate": 4.69497035762096e-08, - "logits/chosen": -2.762821674346924, - "logits/rejected": -2.774388313293457, - "logps/chosen": -233.3072509765625, - "logps/rejected": -377.55517578125, - "loss": 0.009, + "logits/chosen": -2.6779661178588867, + "logits/rejected": -2.682013750076294, + "logps/chosen": -227.499267578125, + "logps/rejected": -316.4037780761719, + "loss": 0.0219, "rewards/accuracies": 1.0, - "rewards/chosen": -0.8804025650024414, - "rewards/margins": 12.22156810760498, - "rewards/rejected": -13.101969718933105, + "rewards/chosen": -0.30000215768814087, + "rewards/margins": 13.721844673156738, + "rewards/rejected": -14.02184772491455, "step": 5320 }, { "epoch": 2.75, "learning_rate": 4.599349780072671e-08, - "logits/chosen": -2.7343251705169678, - "logits/rejected": -2.776947498321533, - "logps/chosen": -225.74252319335938, - "logps/rejected": -361.77642822265625, - "loss": 0.009, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.6561448574066162, - "rewards/margins": 12.739583969116211, - "rewards/rejected": -12.083440780639648, + "logits/chosen": -2.6654956340789795, + "logits/rejected": -2.678917646408081, + "logps/chosen": -226.9166259765625, + "logps/rejected": -341.9263916015625, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5382800102233887, + "rewards/margins": 13.8299560546875, + "rewards/rejected": -13.29167652130127, "step": 5330 }, { "epoch": 2.76, "learning_rate": 4.5037292025243834e-08, - "logits/chosen": -2.7473442554473877, - "logits/rejected": -2.801457166671753, - "logps/chosen": -312.99407958984375, - "logps/rejected": -443.25592041015625, - "loss": 0.0175, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": -0.4658606946468353, - "rewards/margins": 10.478414535522461, - "rewards/rejected": -10.944275856018066, + "logits/chosen": -2.681983232498169, + "logits/rejected": -2.72087025642395, + "logps/chosen": -310.365966796875, + "logps/rejected": -350.5702209472656, + "loss": 0.0094, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.20096246898174286, + "rewards/margins": 13.463506698608398, + "rewards/rejected": -13.664468765258789, "step": 5340 }, { "epoch": 2.76, "learning_rate": 4.408108624976094e-08, - "logits/chosen": -2.8104164600372314, - "logits/rejected": -2.8661699295043945, - "logps/chosen": -202.13577270507812, - "logps/rejected": -429.741455078125, - "loss": 0.0087, + "logits/chosen": -2.745481252670288, + "logits/rejected": -2.7682785987854004, + "logps/chosen": -199.31781005859375, + "logps/rejected": -363.5287780761719, + "loss": 0.0042, "rewards/accuracies": 1.0, - "rewards/chosen": 1.3201662302017212, - "rewards/margins": 14.21458911895752, - "rewards/rejected": -12.89442253112793, + "rewards/chosen": 1.601332664489746, + "rewards/margins": 13.975451469421387, + "rewards/rejected": -12.374117851257324, "step": 5350 }, { "epoch": 2.77, "learning_rate": 4.3124880474278065e-08, - "logits/chosen": -2.8079819679260254, - "logits/rejected": -2.870008945465088, - "logps/chosen": -219.47933959960938, - "logps/rejected": -364.24273681640625, - "loss": 0.0179, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.32802659273147583, - "rewards/margins": 11.084989547729492, - "rewards/rejected": -10.756962776184082, + "logits/chosen": -2.7530078887939453, + "logits/rejected": -2.7754955291748047, + "logps/chosen": -217.3152618408203, + "logps/rejected": -323.1632385253906, + "loss": 0.0295, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.5262772440910339, + "rewards/margins": 12.508352279663086, + "rewards/rejected": -11.982074737548828, "step": 5360 }, { "epoch": 2.77, "learning_rate": 4.2168674698795174e-08, - "logits/chosen": -2.738285779953003, - "logits/rejected": -2.7943713665008545, - "logps/chosen": -246.6175537109375, - "logps/rejected": -403.51226806640625, - "loss": 0.0056, + "logits/chosen": -2.6663436889648438, + "logits/rejected": -2.7151029109954834, + "logps/chosen": -245.7550506591797, + "logps/rejected": -360.86346435546875, + "loss": 0.0047, "rewards/accuracies": 1.0, - "rewards/chosen": 0.01692439243197441, - "rewards/margins": 12.024803161621094, - "rewards/rejected": -12.007880210876465, + "rewards/chosen": 0.10317431390285492, + "rewards/margins": 13.10406494140625, + "rewards/rejected": -13.000892639160156, "step": 5370 }, { "epoch": 2.78, "learning_rate": 4.1212468923312296e-08, - "logits/chosen": -2.6675705909729004, - "logits/rejected": -2.708002805709839, - "logps/chosen": -237.9582061767578, - "logps/rejected": -425.62200927734375, - "loss": 0.0122, + "logits/chosen": -2.599959373474121, + "logits/rejected": -2.619191884994507, + "logps/chosen": -233.2869873046875, + "logps/rejected": -407.2906188964844, + "loss": 0.0062, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.05177364498376846, - "rewards/margins": 11.023565292358398, - "rewards/rejected": -11.075338363647461, + "rewards/chosen": 0.41534656286239624, + "rewards/margins": 13.228411674499512, + "rewards/rejected": -12.813066482543945, "step": 5380 }, { "epoch": 2.78, "learning_rate": 4.025626314782941e-08, - "logits/chosen": -2.641225814819336, - "logits/rejected": -2.7300140857696533, - "logps/chosen": -268.275146484375, - "logps/rejected": -392.2470397949219, - "loss": 0.01, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.6281940937042236, - "rewards/margins": 12.283098220825195, - "rewards/rejected": -11.65490436553955, + "logits/chosen": -2.549999952316284, + "logits/rejected": -2.5964624881744385, + "logps/chosen": -263.39276123046875, + "logps/rejected": -326.8229064941406, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1186546087265015, + "rewards/margins": 14.925954818725586, + "rewards/rejected": -13.807299613952637, "step": 5390 }, { "epoch": 2.79, "learning_rate": 3.930005737234653e-08, - "logits/chosen": -2.786956310272217, - "logits/rejected": -2.852847099304199, - "logps/chosen": -188.55160522460938, - "logps/rejected": -326.9624938964844, - "loss": 0.0145, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.005254592280834913, - "rewards/margins": 10.982970237731934, - "rewards/rejected": -10.977715492248535, + "logits/chosen": -2.717440366744995, + "logits/rejected": -2.7366092205047607, + "logps/chosen": -185.8636932373047, + "logps/rejected": -313.39752197265625, + "loss": 0.0098, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.2728700041770935, + "rewards/margins": 13.362249374389648, + "rewards/rejected": -13.089380264282227, "step": 5400 }, { "epoch": 2.79, - "eval_logits/chosen": -2.7451467514038086, - "eval_logits/rejected": -2.805893898010254, - "eval_logps/chosen": -268.75567626953125, - "eval_logps/rejected": -358.093017578125, - "eval_loss": 0.60563725233078, - "eval_rewards/accuracies": 0.8119999766349792, - "eval_rewards/chosen": -2.1075518131256104, - "eval_rewards/margins": 6.088010311126709, - "eval_rewards/rejected": -8.195561408996582, - "eval_runtime": 278.6585, - "eval_samples_per_second": 7.177, - "eval_steps_per_second": 0.449, + "eval_logits/chosen": -2.6969242095947266, + "eval_logits/rejected": -2.7278921604156494, + "eval_logps/chosen": -259.44915771484375, + "eval_logps/rejected": -329.6948547363281, + "eval_loss": 0.42122241854667664, + "eval_rewards/accuracies": 0.8640000224113464, + "eval_rewards/chosen": -1.163465142250061, + "eval_rewards/margins": 9.232636451721191, + "eval_rewards/rejected": -10.396101951599121, + "eval_runtime": 277.0198, + "eval_samples_per_second": 7.22, + "eval_steps_per_second": 0.451, "step": 5400 }, { "epoch": 2.79, "learning_rate": 3.8343851596863644e-08, - "logits/chosen": -2.7843363285064697, - "logits/rejected": -2.8077871799468994, - "logps/chosen": -202.0702362060547, - "logps/rejected": -344.1629333496094, - "loss": 0.0042, + "logits/chosen": -2.712212085723877, + "logits/rejected": -2.703068971633911, + "logps/chosen": -199.80270385742188, + "logps/rejected": -319.6861267089844, + "loss": 0.0016, "rewards/accuracies": 1.0, - "rewards/chosen": 0.09458614885807037, - "rewards/margins": 11.792885780334473, - "rewards/rejected": -11.698301315307617, + "rewards/chosen": 0.321342796087265, + "rewards/margins": 12.824358940124512, + "rewards/rejected": -12.503016471862793, "step": 5410 }, { "epoch": 2.8, "learning_rate": 3.738764582138076e-08, - "logits/chosen": -2.7376439571380615, - "logits/rejected": -2.723874568939209, - "logps/chosen": -299.224609375, - "logps/rejected": -439.83123779296875, - "loss": 0.0434, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.3747190833091736, - "rewards/margins": 11.698442459106445, - "rewards/rejected": -12.073161125183105, + "logits/chosen": -2.676004409790039, + "logits/rejected": -2.64583158493042, + "logps/chosen": -296.0727233886719, + "logps/rejected": -373.59814453125, + "loss": 0.0098, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0581778883934021, + "rewards/margins": 13.742734909057617, + "rewards/rejected": -13.80091381072998, "step": 5420 }, { "epoch": 2.8, "learning_rate": 3.6431440045897875e-08, - "logits/chosen": -2.7386252880096436, - "logits/rejected": -2.763640880584717, - "logps/chosen": -245.27920532226562, - "logps/rejected": -422.24578857421875, - "loss": 0.0166, + "logits/chosen": -2.6729485988616943, + "logits/rejected": -2.677185535430908, + "logps/chosen": -245.58139038085938, + "logps/rejected": -396.90863037109375, + "loss": 0.036, "rewards/accuracies": 1.0, - "rewards/chosen": 0.29534271359443665, - "rewards/margins": 11.861417770385742, - "rewards/rejected": -11.566075325012207, + "rewards/chosen": 0.2651239335536957, + "rewards/margins": 13.865732192993164, + "rewards/rejected": -13.600606918334961, "step": 5430 }, { "epoch": 2.81, "learning_rate": 3.547523427041499e-08, - "logits/chosen": -2.8376080989837646, - "logits/rejected": -2.8734335899353027, - "logps/chosen": -261.5392761230469, - "logps/rejected": -430.12628173828125, - "loss": 0.009, + "logits/chosen": -2.7551958560943604, + "logits/rejected": -2.7643485069274902, + "logps/chosen": -259.32513427734375, + "logps/rejected": -381.5841064453125, + "loss": 0.0044, "rewards/accuracies": 1.0, - "rewards/chosen": 0.30467402935028076, - "rewards/margins": 11.386775970458984, - "rewards/rejected": -11.08210277557373, + "rewards/chosen": 0.5246171951293945, + "rewards/margins": 14.007672309875488, + "rewards/rejected": -13.483057022094727, "step": 5440 }, { "epoch": 2.81, "learning_rate": 3.4519028494932106e-08, - "logits/chosen": -2.691027879714966, - "logits/rejected": -2.7183306217193604, - "logps/chosen": -277.2605895996094, - "logps/rejected": -429.47998046875, - "loss": 0.0031, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.7162610292434692, - "rewards/margins": 12.104765892028809, - "rewards/rejected": -11.388504028320312, + "logits/chosen": -2.621220350265503, + "logits/rejected": -2.6195569038391113, + "logps/chosen": -272.18414306640625, + "logps/rejected": -396.24322509765625, + "loss": 0.0148, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.2221803665161133, + "rewards/margins": 13.50122356414795, + "rewards/rejected": -12.279044151306152, "step": 5450 }, { "epoch": 2.82, "learning_rate": 3.356282271944923e-08, - "logits/chosen": -2.787083387374878, - "logits/rejected": -2.8620715141296387, - "logps/chosen": -223.35128784179688, - "logps/rejected": -418.36273193359375, - "loss": 0.0158, + "logits/chosen": -2.7143969535827637, + "logits/rejected": -2.735429525375366, + "logps/chosen": -222.1768035888672, + "logps/rejected": -363.3831481933594, + "loss": 0.0108, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.23595857620239258, - "rewards/margins": 11.202669143676758, - "rewards/rejected": -11.438629150390625, + "rewards/chosen": -0.11973223835229874, + "rewards/margins": 13.167340278625488, + "rewards/rejected": -13.287071228027344, "step": 5460 }, { "epoch": 2.82, "learning_rate": 3.260661694396634e-08, - "logits/chosen": -2.8234875202178955, - "logits/rejected": -2.874922275543213, - "logps/chosen": -286.9738464355469, - "logps/rejected": -367.65643310546875, - "loss": 0.0047, + "logits/chosen": -2.756863594055176, + "logits/rejected": -2.7645015716552734, + "logps/chosen": -279.4372253417969, + "logps/rejected": -354.5695495605469, + "loss": 0.0083, "rewards/accuracies": 1.0, - "rewards/chosen": 0.04693390056490898, - "rewards/margins": 11.051458358764648, - "rewards/rejected": -11.004526138305664, + "rewards/chosen": 0.8005949258804321, + "rewards/margins": 13.142946243286133, + "rewards/rejected": -12.342351913452148, "step": 5470 }, { "epoch": 2.83, "learning_rate": 3.165041116848346e-08, - "logits/chosen": -2.787111282348633, - "logits/rejected": -2.856330394744873, - "logps/chosen": -239.1962890625, - "logps/rejected": -455.84100341796875, - "loss": 0.0339, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.376078724861145, - "rewards/margins": 12.097007751464844, - "rewards/rejected": -11.720929145812988, + "logits/chosen": -2.7271335124969482, + "logits/rejected": -2.7696869373321533, + "logps/chosen": -235.31210327148438, + "logps/rejected": -420.21044921875, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.768395721912384, + "rewards/margins": 14.94178581237793, + "rewards/rejected": -14.17339038848877, "step": 5480 }, { "epoch": 2.83, "learning_rate": 3.0694205393000576e-08, - "logits/chosen": -2.758702516555786, - "logits/rejected": -2.7954318523406982, - "logps/chosen": -168.82022094726562, - "logps/rejected": -369.0736083984375, - "loss": 0.0059, + "logits/chosen": -2.673858165740967, + "logits/rejected": -2.6657931804656982, + "logps/chosen": -177.02545166015625, + "logps/rejected": -341.9749450683594, + "loss": 0.005, "rewards/accuracies": 1.0, - "rewards/chosen": 0.675735592842102, - "rewards/margins": 11.410593032836914, - "rewards/rejected": -10.734857559204102, + "rewards/chosen": 0.14254388213157654, + "rewards/margins": 13.541740417480469, + "rewards/rejected": -13.399195671081543, "step": 5490 }, { "epoch": 2.84, "learning_rate": 2.9737999617517688e-08, - "logits/chosen": -2.759615898132324, - "logits/rejected": -2.7838168144226074, - "logps/chosen": -289.23992919921875, - "logps/rejected": -394.021240234375, - "loss": 0.0115, + "logits/chosen": -2.6765246391296387, + "logits/rejected": -2.663633346557617, + "logps/chosen": -295.7442626953125, + "logps/rejected": -363.14056396484375, + "loss": 0.0277, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.07635448127985, - "rewards/margins": 10.999418258666992, - "rewards/rejected": -10.923063278198242, + "rewards/chosen": -0.07735172659158707, + "rewards/margins": 13.318232536315918, + "rewards/rejected": -13.395584106445312, "step": 5500 }, { "epoch": 2.84, - "eval_logits/chosen": -2.7522401809692383, - "eval_logits/rejected": -2.813889265060425, - "eval_logps/chosen": -267.77825927734375, - "eval_logps/rejected": -356.04461669921875, - "eval_loss": 0.5964898467063904, - "eval_rewards/accuracies": 0.8159999847412109, - "eval_rewards/chosen": -2.0098071098327637, - "eval_rewards/margins": 5.9809160232543945, - "eval_rewards/rejected": -7.990723609924316, - "eval_runtime": 278.4881, - "eval_samples_per_second": 7.182, - "eval_steps_per_second": 0.449, + "eval_logits/chosen": -2.6941516399383545, + "eval_logits/rejected": -2.7284586429595947, + "eval_logps/chosen": -260.678466796875, + "eval_logps/rejected": -330.6889343261719, + "eval_loss": 0.41786351799964905, + "eval_rewards/accuracies": 0.871999979019165, + "eval_rewards/chosen": -1.2863978147506714, + "eval_rewards/margins": 9.209112167358398, + "eval_rewards/rejected": -10.49551010131836, + "eval_runtime": 276.7374, + "eval_samples_per_second": 7.227, + "eval_steps_per_second": 0.452, "step": 5500 }, { "epoch": 2.84, "learning_rate": 2.8781793842034804e-08, - "logits/chosen": -2.708756685256958, - "logits/rejected": -2.6932451725006104, - "logps/chosen": -203.24278259277344, - "logps/rejected": -373.3500061035156, - "loss": 0.0186, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": -0.17195767164230347, - "rewards/margins": 10.566624641418457, - "rewards/rejected": -10.738582611083984, + "logits/chosen": -2.61027193069458, + "logits/rejected": -2.607125759124756, + "logps/chosen": -208.0153045654297, + "logps/rejected": -332.18072509765625, + "loss": 0.0136, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23756606876850128, + "rewards/margins": 12.921565055847168, + "rewards/rejected": -13.15913200378418, "step": 5510 }, { "epoch": 2.85, "learning_rate": 2.782558806655192e-08, - "logits/chosen": -2.726964235305786, - "logits/rejected": -2.790684223175049, - "logps/chosen": -295.1116027832031, - "logps/rejected": -394.7130432128906, - "loss": 0.0149, + "logits/chosen": -2.648057222366333, + "logits/rejected": -2.6486594676971436, + "logps/chosen": -294.9002380371094, + "logps/rejected": -330.6114501953125, + "loss": 0.0103, "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.37252020835876465, - "rewards/margins": 11.06298542022705, - "rewards/rejected": -10.690465927124023, + "rewards/chosen": 0.39362195134162903, + "rewards/margins": 13.955568313598633, + "rewards/rejected": -13.561944961547852, "step": 5520 }, { "epoch": 2.85, "learning_rate": 2.6869382291069035e-08, - "logits/chosen": -2.790950298309326, - "logits/rejected": -2.818000555038452, - "logps/chosen": -251.74435424804688, - "logps/rejected": -388.2196960449219, - "loss": 0.0069, + "logits/chosen": -2.705298662185669, + "logits/rejected": -2.7205758094787598, + "logps/chosen": -251.78402709960938, + "logps/rejected": -368.1021423339844, + "loss": 0.004, "rewards/accuracies": 1.0, - "rewards/chosen": 1.0213242769241333, - "rewards/margins": 11.515314102172852, - "rewards/rejected": -10.493988990783691, + "rewards/chosen": 1.5880625247955322, + "rewards/margins": 15.300016403198242, + "rewards/rejected": -13.711954116821289, "step": 5530 }, { "epoch": 2.86, "learning_rate": 2.591317651558615e-08, - "logits/chosen": -2.7424418926239014, - "logits/rejected": -2.795841693878174, - "logps/chosen": -264.7119445800781, - "logps/rejected": -386.6141052246094, - "loss": 0.0052, + "logits/chosen": -2.677079200744629, + "logits/rejected": -2.686626672744751, + "logps/chosen": -260.8871765136719, + "logps/rejected": -352.76629638671875, + "loss": 0.005, "rewards/accuracies": 1.0, - "rewards/chosen": -0.02202761173248291, - "rewards/margins": 10.968466758728027, - "rewards/rejected": -10.990495681762695, + "rewards/chosen": 0.3585185110569, + "rewards/margins": 13.142423629760742, + "rewards/rejected": -12.783905982971191, "step": 5540 }, { "epoch": 2.87, "learning_rate": 2.4956970740103267e-08, - "logits/chosen": -2.7561378479003906, - "logits/rejected": -2.7803356647491455, - "logps/chosen": -244.2420196533203, - "logps/rejected": -442.84710693359375, - "loss": 0.013, + "logits/chosen": -2.686260223388672, + "logits/rejected": -2.6715543270111084, + "logps/chosen": -245.18716430664062, + "logps/rejected": -392.96319580078125, + "loss": 0.006, "rewards/accuracies": 1.0, - "rewards/chosen": -0.20751342177391052, - "rewards/margins": 11.346160888671875, - "rewards/rejected": -11.553674697875977, + "rewards/chosen": 0.19622036814689636, + "rewards/margins": 13.893022537231445, + "rewards/rejected": -13.696802139282227, "step": 5550 }, { "epoch": 2.87, "learning_rate": 2.4000764964620386e-08, - "logits/chosen": -2.789926528930664, - "logits/rejected": -2.8296680450439453, - "logps/chosen": -339.80731201171875, - "logps/rejected": -456.09735107421875, - "loss": 0.0117, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.2991110384464264, - "rewards/margins": 11.766742706298828, - "rewards/rejected": -12.065852165222168, + "logits/chosen": -2.7121362686157227, + "logits/rejected": -2.738884687423706, + "logps/chosen": -338.8020324707031, + "logps/rejected": -401.66021728515625, + "loss": 0.0087, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19621416926383972, + "rewards/margins": 12.684732437133789, + "rewards/rejected": -12.880948066711426, "step": 5560 }, { "epoch": 2.88, "learning_rate": 2.30445591891375e-08, - "logits/chosen": -2.821697473526001, - "logits/rejected": -2.8349757194519043, - "logps/chosen": -321.14044189453125, - "logps/rejected": -400.93121337890625, - "loss": 0.0102, + "logits/chosen": -2.7484335899353027, + "logits/rejected": -2.733487606048584, + "logps/chosen": -314.8695373535156, + "logps/rejected": -372.7781677246094, + "loss": 0.0083, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.1228589192032814, - "rewards/margins": 11.465642929077148, - "rewards/rejected": -11.342782974243164, + "rewards/chosen": 0.9131529927253723, + "rewards/margins": 13.522671699523926, + "rewards/rejected": -12.609518051147461, "step": 5570 }, { "epoch": 2.88, "learning_rate": 2.2088353413654617e-08, - "logits/chosen": -2.675318479537964, - "logits/rejected": -2.7310335636138916, - "logps/chosen": -190.40487670898438, - "logps/rejected": -328.63714599609375, - "loss": 0.0212, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.008567571640014648, - "rewards/margins": 11.152410507202148, - "rewards/rejected": -11.160978317260742, + "logits/chosen": -2.6008219718933105, + "logits/rejected": -2.6308541297912598, + "logps/chosen": -182.9975128173828, + "logps/rejected": -318.9915771484375, + "loss": 0.009, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.733214795589447, + "rewards/margins": 14.48689079284668, + "rewards/rejected": -13.753674507141113, "step": 5580 }, { "epoch": 2.89, "learning_rate": 2.1132147638171733e-08, - "logits/chosen": -2.7256431579589844, - "logits/rejected": -2.7801289558410645, - "logps/chosen": -292.7064514160156, - "logps/rejected": -366.49359130859375, - "loss": 0.0125, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.7942097187042236, - "rewards/margins": 11.075929641723633, - "rewards/rejected": -10.281720161437988, + "logits/chosen": -2.6455812454223633, + "logits/rejected": -2.6950266361236572, + "logps/chosen": -290.97705078125, + "logps/rejected": -339.4418029785156, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9671508073806763, + "rewards/margins": 13.41205883026123, + "rewards/rejected": -12.44490909576416, "step": 5590 }, { "epoch": 2.89, "learning_rate": 2.0175941862688848e-08, - "logits/chosen": -2.7792868614196777, - "logits/rejected": -2.8381927013397217, - "logps/chosen": -220.96133422851562, - "logps/rejected": -358.05316162109375, - "loss": 0.0321, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.2555966079235077, - "rewards/margins": 11.58265209197998, - "rewards/rejected": -11.327055931091309, + "logits/chosen": -2.7136659622192383, + "logits/rejected": -2.7503767013549805, + "logps/chosen": -221.965576171875, + "logps/rejected": -319.40252685546875, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15595309436321259, + "rewards/margins": 13.083261489868164, + "rewards/rejected": -12.92730712890625, "step": 5600 }, { "epoch": 2.89, - "eval_logits/chosen": -2.7509632110595703, - "eval_logits/rejected": -2.8135623931884766, - "eval_logps/chosen": -268.1117858886719, - "eval_logps/rejected": -357.17138671875, - "eval_loss": 0.6050785183906555, - "eval_rewards/accuracies": 0.8080000281333923, - "eval_rewards/chosen": -2.043166399002075, - "eval_rewards/margins": 6.06022834777832, - "eval_rewards/rejected": -8.103395462036133, - "eval_runtime": 278.5351, - "eval_samples_per_second": 7.18, - "eval_steps_per_second": 0.449, + "eval_logits/chosen": -2.6932334899902344, + "eval_logits/rejected": -2.727417230606079, + "eval_logps/chosen": -260.909912109375, + "eval_logps/rejected": -331.0909118652344, + "eval_loss": 0.4178406894207001, + "eval_rewards/accuracies": 0.8679999709129333, + "eval_rewards/chosen": -1.3095418214797974, + "eval_rewards/margins": 9.226164817810059, + "eval_rewards/rejected": -10.535707473754883, + "eval_runtime": 276.5173, + "eval_samples_per_second": 7.233, + "eval_steps_per_second": 0.452, "step": 5600 }, { "epoch": 2.9, "learning_rate": 1.9219736087205964e-08, - "logits/chosen": -2.7538223266601562, - "logits/rejected": -2.788954257965088, - "logps/chosen": -240.25765991210938, - "logps/rejected": -371.2794494628906, - "loss": 0.0157, + "logits/chosen": -2.669834613800049, + "logits/rejected": -2.6698246002197266, + "logps/chosen": -245.5478515625, + "logps/rejected": -325.3616638183594, + "loss": 0.0124, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": -0.055735863745212555, - "rewards/margins": 10.902502059936523, - "rewards/rejected": -10.958239555358887, + "rewards/chosen": -0.49211063981056213, + "rewards/margins": 12.923377990722656, + "rewards/rejected": -13.415489196777344, "step": 5610 }, { "epoch": 2.9, "learning_rate": 1.826353031172308e-08, - "logits/chosen": -2.747121810913086, - "logits/rejected": -2.759830951690674, - "logps/chosen": -335.62060546875, - "logps/rejected": -478.05084228515625, - "loss": 0.01, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.19696936011314392, - "rewards/margins": 11.078533172607422, - "rewards/rejected": -10.881563186645508, + "logits/chosen": -2.6600887775421143, + "logits/rejected": -2.645634412765503, + "logps/chosen": -326.98614501953125, + "logps/rejected": -442.02783203125, + "loss": 0.0075, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0604168176651, + "rewards/margins": 13.573480606079102, + "rewards/rejected": -12.513063430786133, "step": 5620 }, { "epoch": 2.91, "learning_rate": 1.73073245362402e-08, - "logits/chosen": -2.7759406566619873, - "logits/rejected": -2.8184428215026855, - "logps/chosen": -253.0712890625, - "logps/rejected": -366.9938049316406, - "loss": 0.0103, + "logits/chosen": -2.6886191368103027, + "logits/rejected": -2.7192580699920654, + "logps/chosen": -252.1400604248047, + "logps/rejected": -360.3085021972656, + "loss": 0.0086, "rewards/accuracies": 1.0, - "rewards/chosen": -0.258527547121048, - "rewards/margins": 11.674029350280762, - "rewards/rejected": -11.932558059692383, + "rewards/chosen": 0.3383222222328186, + "rewards/margins": 14.029278755187988, + "rewards/rejected": -13.690958023071289, "step": 5630 }, { "epoch": 2.91, "learning_rate": 1.6351118760757314e-08, - "logits/chosen": -2.789585590362549, - "logits/rejected": -2.819732666015625, - "logps/chosen": -250.55001831054688, - "logps/rejected": -388.7904052734375, - "loss": 0.0081, + "logits/chosen": -2.7019834518432617, + "logits/rejected": -2.7205471992492676, + "logps/chosen": -246.2549591064453, + "logps/rejected": -334.0852355957031, + "loss": 0.0069, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.04169013351202011, - "rewards/margins": 11.472790718078613, - "rewards/rejected": -11.431100845336914, + "rewards/chosen": 0.49922728538513184, + "rewards/margins": 12.960168838500977, + "rewards/rejected": -12.460942268371582, "step": 5640 }, { "epoch": 2.92, "learning_rate": 1.539491298527443e-08, - "logits/chosen": -2.7909374237060547, - "logits/rejected": -2.8411381244659424, - "logps/chosen": -233.0658721923828, - "logps/rejected": -389.83880615234375, - "loss": 0.0036, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.0178313497453928, - "rewards/margins": 11.449518203735352, - "rewards/rejected": -11.467348098754883, + "logits/chosen": -2.715240001678467, + "logits/rejected": -2.7315754890441895, + "logps/chosen": -228.40567016601562, + "logps/rejected": -364.71502685546875, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4481922686100006, + "rewards/margins": 13.214696884155273, + "rewards/rejected": -12.766504287719727, "step": 5650 }, { "epoch": 2.92, "learning_rate": 1.4438707209791546e-08, - "logits/chosen": -2.77156925201416, - "logits/rejected": -2.8066821098327637, - "logps/chosen": -251.38528442382812, - "logps/rejected": -344.01568603515625, - "loss": 0.0159, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.047732848674058914, - "rewards/margins": 10.928056716918945, - "rewards/rejected": -10.88032341003418, + "logits/chosen": -2.6868813037872314, + "logits/rejected": -2.706840991973877, + "logps/chosen": -251.84774780273438, + "logps/rejected": -318.03118896484375, + "loss": 0.0115, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.0014841258525848389, + "rewards/margins": 13.272031784057617, + "rewards/rejected": -13.270547866821289, "step": 5660 }, { "epoch": 2.93, "learning_rate": 1.3482501434308661e-08, - "logits/chosen": -2.7734055519104004, - "logits/rejected": -2.776326894760132, - "logps/chosen": -257.57904052734375, - "logps/rejected": -362.7672424316406, - "loss": 0.0103, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.4579504132270813, - "rewards/margins": 10.892066955566406, - "rewards/rejected": -10.43411636352539, + "logits/chosen": -2.704782009124756, + "logits/rejected": -2.693816661834717, + "logps/chosen": -257.1002502441406, + "logps/rejected": -327.7674865722656, + "loss": 0.0118, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5079609155654907, + "rewards/margins": 12.477558135986328, + "rewards/rejected": -11.969596862792969, "step": 5670 }, { "epoch": 2.93, "learning_rate": 1.2526295658825777e-08, - "logits/chosen": -2.8509745597839355, - "logits/rejected": -2.8811259269714355, - "logps/chosen": -281.4604187011719, - "logps/rejected": -409.92816162109375, - "loss": 0.0142, + "logits/chosen": -2.786997079849243, + "logits/rejected": -2.7386133670806885, + "logps/chosen": -279.8153991699219, + "logps/rejected": -351.1986999511719, + "loss": 0.013, "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.2449922263622284, - "rewards/margins": 11.449122428894043, - "rewards/rejected": -11.204131126403809, + "rewards/chosen": 0.40878015756607056, + "rewards/margins": 13.892974853515625, + "rewards/rejected": -13.4841947555542, "step": 5680 }, { "epoch": 2.94, "learning_rate": 1.1570089883342895e-08, - "logits/chosen": -2.730487108230591, - "logits/rejected": -2.734302043914795, - "logps/chosen": -316.2430114746094, - "logps/rejected": -447.566650390625, - "loss": 0.0113, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.04693268612027168, - "rewards/margins": 12.631341934204102, - "rewards/rejected": -12.5844087600708, + "logits/chosen": -2.6509201526641846, + "logits/rejected": -2.6537039279937744, + "logps/chosen": -312.64984130859375, + "logps/rejected": -384.8985595703125, + "loss": 0.0193, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.4096846580505371, + "rewards/margins": 13.473419189453125, + "rewards/rejected": -13.063733100891113, "step": 5690 }, { "epoch": 2.94, "learning_rate": 1.061388410786001e-08, - "logits/chosen": -2.731062650680542, - "logits/rejected": -2.831976890563965, - "logps/chosen": -206.12698364257812, - "logps/rejected": -342.6966552734375, - "loss": 0.0087, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.0030780285596847534, - "rewards/margins": 10.542811393737793, - "rewards/rejected": -10.539732933044434, + "logits/chosen": -2.6757428646087646, + "logits/rejected": -2.7123563289642334, + "logps/chosen": -201.38961791992188, + "logps/rejected": -315.7047119140625, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.47685426473617554, + "rewards/margins": 13.87553596496582, + "rewards/rejected": -13.398681640625, "step": 5700 }, { "epoch": 2.94, - "eval_logits/chosen": -2.747497320175171, - "eval_logits/rejected": -2.810014486312866, - "eval_logps/chosen": -267.9060974121094, - "eval_logps/rejected": -357.0297546386719, - "eval_loss": 0.6040644645690918, - "eval_rewards/accuracies": 0.8140000104904175, - "eval_rewards/chosen": -2.0225942134857178, - "eval_rewards/margins": 6.066638469696045, - "eval_rewards/rejected": -8.0892333984375, - "eval_runtime": 278.4442, - "eval_samples_per_second": 7.183, - "eval_steps_per_second": 0.449, + "eval_logits/chosen": -2.6958701610565186, + "eval_logits/rejected": -2.7292940616607666, + "eval_logps/chosen": -259.8774719238281, + "eval_logps/rejected": -329.76837158203125, + "eval_loss": 0.41407620906829834, + "eval_rewards/accuracies": 0.8679999709129333, + "eval_rewards/chosen": -1.2062982320785522, + "eval_rewards/margins": 9.197154998779297, + "eval_rewards/rejected": -10.40345287322998, + "eval_runtime": 276.055, + "eval_samples_per_second": 7.245, + "eval_steps_per_second": 0.453, "step": 5700 }, { "epoch": 2.95, "learning_rate": 9.657678332377126e-09, - "logits/chosen": -2.7119622230529785, - "logits/rejected": -2.760438919067383, - "logps/chosen": -233.8179931640625, - "logps/rejected": -365.3365783691406, - "loss": 0.0147, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.6185123920440674, - "rewards/margins": 11.846052169799805, - "rewards/rejected": -11.227540016174316, + "logits/chosen": -2.638190269470215, + "logits/rejected": -2.6677918434143066, + "logps/chosen": -230.2301483154297, + "logps/rejected": -325.7227478027344, + "loss": 0.0066, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.978204071521759, + "rewards/margins": 14.886259078979492, + "rewards/rejected": -13.908055305480957, "step": 5710 }, { "epoch": 2.95, "learning_rate": 8.701472556894243e-09, - "logits/chosen": -2.7717738151550293, - "logits/rejected": -2.7559776306152344, - "logps/chosen": -206.87277221679688, - "logps/rejected": -384.5017395019531, - "loss": 0.0089, - "rewards/accuracies": 1.0, - "rewards/chosen": -0.37782225012779236, - "rewards/margins": 11.53270435333252, - "rewards/rejected": -11.910527229309082, + "logits/chosen": -2.706774950027466, + "logits/rejected": -2.6931397914886475, + "logps/chosen": -200.55712890625, + "logps/rejected": -333.51873779296875, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2537408769130707, + "rewards/margins": 13.56867504119873, + "rewards/rejected": -13.314933776855469, "step": 5720 }, { "epoch": 2.96, "learning_rate": 7.745266781411359e-09, - "logits/chosen": -2.793654203414917, - "logits/rejected": -2.835156202316284, - "logps/chosen": -264.3742370605469, - "logps/rejected": -431.3934631347656, - "loss": 0.0203, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.7172238230705261, - "rewards/margins": 11.648252487182617, - "rewards/rejected": -10.931028366088867, + "logits/chosen": -2.7171568870544434, + "logits/rejected": -2.7356979846954346, + "logps/chosen": -269.98992919921875, + "logps/rejected": -365.6961364746094, + "loss": 0.0152, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15525959432125092, + "rewards/margins": 13.448030471801758, + "rewards/rejected": -13.292770385742188, "step": 5730 }, { "epoch": 2.96, "learning_rate": 6.7890610059284754e-09, - "logits/chosen": -2.7689952850341797, - "logits/rejected": -2.7692933082580566, - "logps/chosen": -179.81179809570312, - "logps/rejected": -330.4810485839844, - "loss": 0.024, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.23751434683799744, - "rewards/margins": 10.173518180847168, - "rewards/rejected": -9.936005592346191, + "logits/chosen": -2.701836109161377, + "logits/rejected": -2.6965954303741455, + "logps/chosen": -170.1682586669922, + "logps/rejected": -305.47894287109375, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2018935680389404, + "rewards/margins": 13.130075454711914, + "rewards/rejected": -11.928182601928711, "step": 5740 }, { "epoch": 2.97, "learning_rate": 5.832855230445592e-09, - "logits/chosen": -2.700852870941162, - "logits/rejected": -2.771523952484131, - "logps/chosen": -291.72332763671875, - "logps/rejected": -378.6692810058594, - "loss": 0.0458, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.45632845163345337, - "rewards/margins": 11.953349113464355, - "rewards/rejected": -11.497020721435547, + "logits/chosen": -2.6432783603668213, + "logits/rejected": -2.6968448162078857, + "logps/chosen": -291.81695556640625, + "logps/rejected": -351.7596740722656, + "loss": 0.0085, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.43952980637550354, + "rewards/margins": 14.156962394714355, + "rewards/rejected": -13.717432975769043, "step": 5750 }, { "epoch": 2.97, "learning_rate": 4.8766494549627085e-09, - "logits/chosen": -2.7623281478881836, - "logits/rejected": -2.7841367721557617, - "logps/chosen": -276.88714599609375, - "logps/rejected": -383.640380859375, - "loss": 0.0099, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.8869491815567017, - "rewards/margins": 12.092225074768066, - "rewards/rejected": -11.205277442932129, + "logits/chosen": -2.6910552978515625, + "logits/rejected": -2.6976256370544434, + "logps/chosen": -277.7280578613281, + "logps/rejected": -331.58612060546875, + "loss": 0.0058, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8028562664985657, + "rewards/margins": 12.933176040649414, + "rewards/rejected": -12.130319595336914, "step": 5760 }, { "epoch": 2.98, "learning_rate": 3.920443679479824e-09, - "logits/chosen": -2.8144805431365967, - "logits/rejected": -2.8412563800811768, - "logps/chosen": -268.53143310546875, - "logps/rejected": -369.7856750488281, - "loss": 0.0176, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.38506728410720825, - "rewards/margins": 10.940896034240723, - "rewards/rejected": -10.555828094482422, + "logits/chosen": -2.7480709552764893, + "logits/rejected": -2.7490665912628174, + "logps/chosen": -267.54022216796875, + "logps/rejected": -341.2646484375, + "loss": 0.0077, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6833750009536743, + "rewards/margins": 13.129834175109863, + "rewards/rejected": -12.446457862854004, "step": 5770 }, { "epoch": 2.98, "learning_rate": 2.96423790399694e-09, - "logits/chosen": -2.7938127517700195, - "logits/rejected": -2.842219591140747, - "logps/chosen": -228.3662567138672, - "logps/rejected": -393.7285461425781, - "loss": 0.0259, - "rewards/accuracies": 1.0, - "rewards/chosen": 0.6316030025482178, - "rewards/margins": 11.577722549438477, - "rewards/rejected": -10.94611930847168, + "logits/chosen": -2.7321503162384033, + "logits/rejected": -2.741072416305542, + "logps/chosen": -228.5279083251953, + "logps/rejected": -366.03851318359375, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6154392957687378, + "rewards/margins": 13.328977584838867, + "rewards/rejected": -12.71353816986084, "step": 5780 }, { "epoch": 2.99, "learning_rate": 2.008032128514056e-09, - "logits/chosen": -2.8088643550872803, - "logits/rejected": -2.828951597213745, - "logps/chosen": -286.57672119140625, - "logps/rejected": -427.19659423828125, - "loss": 0.0115, + "logits/chosen": -2.7263731956481934, + "logits/rejected": -2.709667682647705, + "logps/chosen": -284.0499572753906, + "logps/rejected": -382.48065185546875, + "loss": 0.0059, "rewards/accuracies": 1.0, - "rewards/chosen": 0.6554132103919983, - "rewards/margins": 12.086005210876465, - "rewards/rejected": -11.430593490600586, + "rewards/chosen": 1.1916559934616089, + "rewards/margins": 14.683520317077637, + "rewards/rejected": -13.491865158081055, "step": 5790 }, { "epoch": 2.99, "learning_rate": 1.0518263530311723e-09, - "logits/chosen": -2.7882561683654785, - "logits/rejected": -2.8440985679626465, - "logps/chosen": -204.60406494140625, - "logps/rejected": -374.4756774902344, - "loss": 0.0057, + "logits/chosen": -2.716102123260498, + "logits/rejected": -2.752955913543701, + "logps/chosen": -206.16140747070312, + "logps/rejected": -338.40191650390625, + "loss": 0.0014, "rewards/accuracies": 1.0, - "rewards/chosen": 0.47186797857284546, - "rewards/margins": 10.928264617919922, - "rewards/rejected": -10.456398010253906, + "rewards/chosen": 0.3178180456161499, + "rewards/margins": 12.861709594726562, + "rewards/rejected": -12.543892860412598, "step": 5800 }, { "epoch": 2.99, - "eval_logits/chosen": -2.7457115650177, - "eval_logits/rejected": -2.808168411254883, - "eval_logps/chosen": -267.2555847167969, - "eval_logps/rejected": -356.2176208496094, - "eval_loss": 0.6030594110488892, - "eval_rewards/accuracies": 0.8140000104904175, - "eval_rewards/chosen": -1.9575421810150146, - "eval_rewards/margins": 6.05047607421875, - "eval_rewards/rejected": -8.008018493652344, - "eval_runtime": 278.3945, - "eval_samples_per_second": 7.184, - "eval_steps_per_second": 0.449, + "eval_logits/chosen": -2.696916103363037, + "eval_logits/rejected": -2.730095386505127, + "eval_logps/chosen": -259.411865234375, + "eval_logps/rejected": -329.3108215332031, + "eval_loss": 0.4131491184234619, + "eval_rewards/accuracies": 0.8700000047683716, + "eval_rewards/chosen": -1.1597379446029663, + "eval_rewards/margins": 9.19796085357666, + "eval_rewards/rejected": -10.357698440551758, + "eval_runtime": 276.3761, + "eval_samples_per_second": 7.237, + "eval_steps_per_second": 0.452, "step": 5800 }, { "epoch": 3.0, "learning_rate": 9.562057754828839e-11, - "logits/chosen": -2.7588553428649902, - "logits/rejected": -2.836669445037842, - "logps/chosen": -241.68515014648438, - "logps/rejected": -372.9002380371094, - "loss": 0.0155, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.14204326272010803, - "rewards/margins": 10.061766624450684, - "rewards/rejected": -9.919723510742188, + "logits/chosen": -2.6807446479797363, + "logits/rejected": -2.714232921600342, + "logps/chosen": -243.63076782226562, + "logps/rejected": -308.332763671875, + "loss": 0.013, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.050807129591703415, + "rewards/margins": 12.483884811401367, + "rewards/rejected": -12.53469181060791, "step": 5810 }, { "epoch": 3.0, "step": 5811, "total_flos": 0.0, - "train_loss": 0.19806672207460788, - "train_runtime": 74526.9689, - "train_samples_per_second": 2.494, + "train_loss": 0.1539872911558545, + "train_runtime": 74041.8418, + "train_samples_per_second": 2.511, "train_steps_per_second": 0.078 } ],