diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4566 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 2.9992254066615027, + "eval_steps": 100, + "global_step": 2904, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "learning_rate": 1.7182130584192438e-09, + "logits/chosen": -2.8386430740356445, + "logits/rejected": -2.8774726390838623, + "logps/chosen": -396.7501220703125, + "logps/rejected": -306.6087951660156, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.01, + "learning_rate": 1.718213058419244e-08, + "logits/chosen": -2.9410815238952637, + "logits/rejected": -2.9279916286468506, + "logps/chosen": -364.9696350097656, + "logps/rejected": -268.6126403808594, + "loss": 0.6943, + "rewards/accuracies": 0.4722222089767456, + "rewards/chosen": 0.005337627604603767, + "rewards/margins": 0.005309337750077248, + "rewards/rejected": 2.8289776309975423e-05, + "step": 10 + }, + { + "epoch": 0.02, + "learning_rate": 3.436426116838488e-08, + "logits/chosen": -2.930140972137451, + "logits/rejected": -2.9554715156555176, + "logps/chosen": -357.8751525878906, + "logps/rejected": -295.104736328125, + "loss": 0.6828, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.05925828218460083, + "rewards/margins": 0.02539578638970852, + "rewards/rejected": 0.03386249393224716, + "step": 20 + }, + { + "epoch": 0.03, + "learning_rate": 5.154639175257731e-08, + "logits/chosen": -2.9392919540405273, + "logits/rejected": -2.9263033866882324, + "logps/chosen": -336.83807373046875, + "logps/rejected": -283.37139892578125, + "loss": 0.6563, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": 0.17703184485435486, + "rewards/margins": 0.07941180467605591, + "rewards/rejected": 0.09762003272771835, + "step": 30 + }, + { + "epoch": 0.04, + "learning_rate": 6.872852233676976e-08, + "logits/chosen": -2.9387967586517334, + "logits/rejected": -2.9206314086914062, + "logps/chosen": -384.8312683105469, + "logps/rejected": -306.2314453125, + "loss": 0.6442, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.3545844554901123, + "rewards/margins": 0.17298033833503723, + "rewards/rejected": 0.18160411715507507, + "step": 40 + }, + { + "epoch": 0.05, + "learning_rate": 8.59106529209622e-08, + "logits/chosen": -2.9341976642608643, + "logits/rejected": -2.901994466781616, + "logps/chosen": -324.2142639160156, + "logps/rejected": -247.2388458251953, + "loss": 0.6214, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": 0.37454044818878174, + "rewards/margins": 0.2947521507740021, + "rewards/rejected": 0.07978831231594086, + "step": 50 + }, + { + "epoch": 0.06, + "learning_rate": 1.0309278350515462e-07, + "logits/chosen": -2.929769992828369, + "logits/rejected": -2.9284074306488037, + "logps/chosen": -358.61199951171875, + "logps/rejected": -272.9038391113281, + "loss": 0.6117, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.5510571599006653, + "rewards/margins": 0.40916723012924194, + "rewards/rejected": 0.14188989996910095, + "step": 60 + }, + { + "epoch": 0.07, + "learning_rate": 1.202749140893471e-07, + "logits/chosen": -2.9810051918029785, + "logits/rejected": -2.98490571975708, + "logps/chosen": -334.68927001953125, + "logps/rejected": -265.19873046875, + "loss": 0.6119, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.46736612915992737, + "rewards/margins": 0.40175753831863403, + "rewards/rejected": 0.06560859829187393, + "step": 70 + }, + { + "epoch": 0.08, + "learning_rate": 1.3745704467353952e-07, + "logits/chosen": -2.9579415321350098, + "logits/rejected": -2.9277639389038086, + "logps/chosen": -346.24163818359375, + "logps/rejected": -263.8692321777344, + "loss": 0.558, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4413931965827942, + "rewards/margins": 0.5552295446395874, + "rewards/rejected": -0.1138363927602768, + "step": 80 + }, + { + "epoch": 0.09, + "learning_rate": 1.5463917525773197e-07, + "logits/chosen": -2.9735989570617676, + "logits/rejected": -2.9386916160583496, + "logps/chosen": -307.0599670410156, + "logps/rejected": -263.30126953125, + "loss": 0.5294, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.45517802238464355, + "rewards/margins": 0.5852676630020142, + "rewards/rejected": -0.1300896406173706, + "step": 90 + }, + { + "epoch": 0.1, + "learning_rate": 1.718213058419244e-07, + "logits/chosen": -2.8946220874786377, + "logits/rejected": -2.891949415206909, + "logps/chosen": -353.20416259765625, + "logps/rejected": -249.47891235351562, + "loss": 0.5513, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": 0.47438564896583557, + "rewards/margins": 0.6653046607971191, + "rewards/rejected": -0.1909189671278, + "step": 100 + }, + { + "epoch": 0.1, + "eval_logits/chosen": -2.9198687076568604, + "eval_logits/rejected": -2.904863119125366, + "eval_logps/chosen": -348.2250061035156, + "eval_logps/rejected": -286.0747375488281, + "eval_loss": 0.5430884957313538, + "eval_rewards/accuracies": 0.765999972820282, + "eval_rewards/chosen": 0.4640864431858063, + "eval_rewards/margins": 0.6596349477767944, + "eval_rewards/rejected": -0.19554853439331055, + "eval_runtime": 499.9395, + "eval_samples_per_second": 4.0, + "eval_steps_per_second": 0.5, + "step": 100 + }, + { + "epoch": 0.11, + "learning_rate": 1.8900343642611682e-07, + "logits/chosen": -2.873088836669922, + "logits/rejected": -2.864135265350342, + "logps/chosen": -339.3306884765625, + "logps/rejected": -271.645751953125, + "loss": 0.5789, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.2165219783782959, + "rewards/margins": 0.534845769405365, + "rewards/rejected": -0.31832385063171387, + "step": 110 + }, + { + "epoch": 0.12, + "learning_rate": 2.0618556701030925e-07, + "logits/chosen": -2.948822259902954, + "logits/rejected": -2.9265244007110596, + "logps/chosen": -339.6183166503906, + "logps/rejected": -278.4152526855469, + "loss": 0.55, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": 0.3460471034049988, + "rewards/margins": 0.47235360741615295, + "rewards/rejected": -0.12630648910999298, + "step": 120 + }, + { + "epoch": 0.13, + "learning_rate": 2.2336769759450173e-07, + "logits/chosen": -2.9342408180236816, + "logits/rejected": -2.939258098602295, + "logps/chosen": -329.4230651855469, + "logps/rejected": -291.20989990234375, + "loss": 0.5407, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": 0.4995655417442322, + "rewards/margins": 0.7382161617279053, + "rewards/rejected": -0.23865056037902832, + "step": 130 + }, + { + "epoch": 0.14, + "learning_rate": 2.405498281786942e-07, + "logits/chosen": -2.9660542011260986, + "logits/rejected": -2.9834768772125244, + "logps/chosen": -349.0820007324219, + "logps/rejected": -262.83416748046875, + "loss": 0.5112, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4791792035102844, + "rewards/margins": 0.8585275411605835, + "rewards/rejected": -0.37934836745262146, + "step": 140 + }, + { + "epoch": 0.15, + "learning_rate": 2.5773195876288655e-07, + "logits/chosen": -2.9196953773498535, + "logits/rejected": -2.9103140830993652, + "logps/chosen": -347.46453857421875, + "logps/rejected": -260.7893981933594, + "loss": 0.539, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": 0.5144405364990234, + "rewards/margins": 0.822402834892273, + "rewards/rejected": -0.3079623579978943, + "step": 150 + }, + { + "epoch": 0.17, + "learning_rate": 2.7491408934707903e-07, + "logits/chosen": -2.9616494178771973, + "logits/rejected": -2.9489588737487793, + "logps/chosen": -328.7740478515625, + "logps/rejected": -265.5461730957031, + "loss": 0.5049, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": 0.6073015928268433, + "rewards/margins": 0.9107543230056763, + "rewards/rejected": -0.3034527003765106, + "step": 160 + }, + { + "epoch": 0.18, + "learning_rate": 2.9209621993127146e-07, + "logits/chosen": -2.9152207374572754, + "logits/rejected": -2.923574924468994, + "logps/chosen": -343.9161071777344, + "logps/rejected": -280.7509765625, + "loss": 0.5156, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.4770805239677429, + "rewards/margins": 1.0360281467437744, + "rewards/rejected": -0.5589475631713867, + "step": 170 + }, + { + "epoch": 0.19, + "learning_rate": 3.0927835051546394e-07, + "logits/chosen": -2.907759428024292, + "logits/rejected": -2.9071803092956543, + "logps/chosen": -330.72161865234375, + "logps/rejected": -265.810302734375, + "loss": 0.4979, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.33995530009269714, + "rewards/margins": 0.9378688931465149, + "rewards/rejected": -0.5979136228561401, + "step": 180 + }, + { + "epoch": 0.2, + "learning_rate": 3.2646048109965636e-07, + "logits/chosen": -2.939296245574951, + "logits/rejected": -2.9180989265441895, + "logps/chosen": -300.32318115234375, + "logps/rejected": -250.5105743408203, + "loss": 0.5614, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.38950082659721375, + "rewards/margins": 0.9293573498725891, + "rewards/rejected": -0.539856493473053, + "step": 190 + }, + { + "epoch": 0.21, + "learning_rate": 3.436426116838488e-07, + "logits/chosen": -2.9912192821502686, + "logits/rejected": -2.9657769203186035, + "logps/chosen": -335.5823669433594, + "logps/rejected": -279.8517761230469, + "loss": 0.5322, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": 0.3443685472011566, + "rewards/margins": 0.8442217111587524, + "rewards/rejected": -0.49985313415527344, + "step": 200 + }, + { + "epoch": 0.21, + "eval_logits/chosen": -2.9475507736206055, + "eval_logits/rejected": -2.9307024478912354, + "eval_logps/chosen": -348.5544128417969, + "eval_logps/rejected": -291.5714111328125, + "eval_loss": 0.5251129269599915, + "eval_rewards/accuracies": 0.7680000066757202, + "eval_rewards/chosen": 0.4311439096927643, + "eval_rewards/margins": 1.1763547658920288, + "eval_rewards/rejected": -0.7452106475830078, + "eval_runtime": 500.63, + "eval_samples_per_second": 3.995, + "eval_steps_per_second": 0.499, + "step": 200 + }, + { + "epoch": 0.22, + "learning_rate": 3.608247422680412e-07, + "logits/chosen": -2.9167776107788086, + "logits/rejected": -2.909834384918213, + "logps/chosen": -347.0169372558594, + "logps/rejected": -276.9785461425781, + "loss": 0.4708, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.6056002974510193, + "rewards/margins": 1.2408610582351685, + "rewards/rejected": -0.6352607011795044, + "step": 210 + }, + { + "epoch": 0.23, + "learning_rate": 3.7800687285223364e-07, + "logits/chosen": -2.945709705352783, + "logits/rejected": -2.9418673515319824, + "logps/chosen": -314.41168212890625, + "logps/rejected": -284.7486877441406, + "loss": 0.5639, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": 0.4781018793582916, + "rewards/margins": 1.122393012046814, + "rewards/rejected": -0.6442912220954895, + "step": 220 + }, + { + "epoch": 0.24, + "learning_rate": 3.9518900343642607e-07, + "logits/chosen": -2.9685254096984863, + "logits/rejected": -2.9509358406066895, + "logps/chosen": -348.00921630859375, + "logps/rejected": -302.19171142578125, + "loss": 0.5851, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.428355872631073, + "rewards/margins": 1.09479820728302, + "rewards/rejected": -0.666442334651947, + "step": 230 + }, + { + "epoch": 0.25, + "learning_rate": 4.123711340206185e-07, + "logits/chosen": -3.018501043319702, + "logits/rejected": -3.016463041305542, + "logps/chosen": -315.842529296875, + "logps/rejected": -263.87420654296875, + "loss": 0.4913, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": 0.3246752619743347, + "rewards/margins": 1.0819722414016724, + "rewards/rejected": -0.7572969198226929, + "step": 240 + }, + { + "epoch": 0.26, + "learning_rate": 4.2955326460481097e-07, + "logits/chosen": -3.021206855773926, + "logits/rejected": -3.0122694969177246, + "logps/chosen": -387.8692626953125, + "logps/rejected": -301.49224853515625, + "loss": 0.5462, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.2839759886264801, + "rewards/margins": 1.1730202436447144, + "rewards/rejected": -0.8890441656112671, + "step": 250 + }, + { + "epoch": 0.27, + "learning_rate": 4.4673539518900345e-07, + "logits/chosen": -3.0731797218322754, + "logits/rejected": -3.0552587509155273, + "logps/chosen": -328.5102844238281, + "logps/rejected": -269.9062194824219, + "loss": 0.6421, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": 0.0661497563123703, + "rewards/margins": 0.8637269735336304, + "rewards/rejected": -0.7975772023200989, + "step": 260 + }, + { + "epoch": 0.28, + "learning_rate": 4.639175257731959e-07, + "logits/chosen": -3.074122905731201, + "logits/rejected": -3.061890125274658, + "logps/chosen": -315.14642333984375, + "logps/rejected": -286.4360046386719, + "loss": 0.5662, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.12611123919487, + "rewards/margins": 0.9487603902816772, + "rewards/rejected": -0.8226491808891296, + "step": 270 + }, + { + "epoch": 0.29, + "learning_rate": 4.810996563573884e-07, + "logits/chosen": -2.9959123134613037, + "logits/rejected": -2.9872357845306396, + "logps/chosen": -372.4709167480469, + "logps/rejected": -292.8404541015625, + "loss": 0.6315, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.5914579629898071, + "rewards/margins": 1.3111344575881958, + "rewards/rejected": -0.7196764945983887, + "step": 280 + }, + { + "epoch": 0.3, + "learning_rate": 4.982817869415807e-07, + "logits/chosen": -3.0343637466430664, + "logits/rejected": -2.992107391357422, + "logps/chosen": -342.8162536621094, + "logps/rejected": -284.862548828125, + "loss": 0.569, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.5013134479522705, + "rewards/margins": 1.26241135597229, + "rewards/rejected": -0.7610978484153748, + "step": 290 + }, + { + "epoch": 0.31, + "learning_rate": 4.982778415614236e-07, + "logits/chosen": -2.9965505599975586, + "logits/rejected": -3.0124495029449463, + "logps/chosen": -305.66571044921875, + "logps/rejected": -315.95111083984375, + "loss": 0.602, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.5931826829910278, + "rewards/margins": 1.2151721715927124, + "rewards/rejected": -0.6219894886016846, + "step": 300 + }, + { + "epoch": 0.31, + "eval_logits/chosen": -2.99521541595459, + "eval_logits/rejected": -2.96345853805542, + "eval_logps/chosen": -347.50177001953125, + "eval_logps/rejected": -290.2912902832031, + "eval_loss": 0.5439262986183167, + "eval_rewards/accuracies": 0.7440000176429749, + "eval_rewards/chosen": 0.5364080667495728, + "eval_rewards/margins": 1.1536086797714233, + "eval_rewards/rejected": -0.6172006130218506, + "eval_runtime": 500.8971, + "eval_samples_per_second": 3.993, + "eval_steps_per_second": 0.499, + "step": 300 + }, + { + "epoch": 0.32, + "learning_rate": 4.963643321852277e-07, + "logits/chosen": -3.0448598861694336, + "logits/rejected": -3.017517566680908, + "logps/chosen": -353.1573486328125, + "logps/rejected": -284.19000244140625, + "loss": 0.5839, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.37520498037338257, + "rewards/margins": 0.9747766256332397, + "rewards/rejected": -0.5995717644691467, + "step": 310 + }, + { + "epoch": 0.33, + "learning_rate": 4.944508228090318e-07, + "logits/chosen": -3.0423622131347656, + "logits/rejected": -2.9570577144622803, + "logps/chosen": -302.97955322265625, + "logps/rejected": -245.19552612304688, + "loss": 0.5714, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.5704945921897888, + "rewards/margins": 0.9897972345352173, + "rewards/rejected": -0.41930264234542847, + "step": 320 + }, + { + "epoch": 0.34, + "learning_rate": 4.925373134328357e-07, + "logits/chosen": -3.0704784393310547, + "logits/rejected": -3.0522284507751465, + "logps/chosen": -336.61260986328125, + "logps/rejected": -285.2626037597656, + "loss": 0.5327, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.5059818029403687, + "rewards/margins": 1.1841099262237549, + "rewards/rejected": -0.6781281232833862, + "step": 330 + }, + { + "epoch": 0.35, + "learning_rate": 4.906238040566398e-07, + "logits/chosen": -2.9792017936706543, + "logits/rejected": -3.0039639472961426, + "logps/chosen": -294.2068786621094, + "logps/rejected": -279.0820617675781, + "loss": 0.6637, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": 0.2278396338224411, + "rewards/margins": 0.7685919404029846, + "rewards/rejected": -0.5407522916793823, + "step": 340 + }, + { + "epoch": 0.36, + "learning_rate": 4.887102946804438e-07, + "logits/chosen": -3.072432279586792, + "logits/rejected": -3.047440528869629, + "logps/chosen": -356.6179504394531, + "logps/rejected": -278.54400634765625, + "loss": 0.537, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": 0.7054955959320068, + "rewards/margins": 1.3675787448883057, + "rewards/rejected": -0.6620832085609436, + "step": 350 + }, + { + "epoch": 0.37, + "learning_rate": 4.867967853042479e-07, + "logits/chosen": -3.057285785675049, + "logits/rejected": -3.0351486206054688, + "logps/chosen": -301.95526123046875, + "logps/rejected": -284.12005615234375, + "loss": 0.5121, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.5215950012207031, + "rewards/margins": 1.3286265134811401, + "rewards/rejected": -0.807031512260437, + "step": 360 + }, + { + "epoch": 0.38, + "learning_rate": 4.84883275928052e-07, + "logits/chosen": -3.037200450897217, + "logits/rejected": -3.044207811355591, + "logps/chosen": -333.80474853515625, + "logps/rejected": -304.6210021972656, + "loss": 0.5898, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.2941075265407562, + "rewards/margins": 1.1119602918624878, + "rewards/rejected": -0.8178526759147644, + "step": 370 + }, + { + "epoch": 0.39, + "learning_rate": 4.82969766551856e-07, + "logits/chosen": -3.054816484451294, + "logits/rejected": -3.0310795307159424, + "logps/chosen": -367.0569152832031, + "logps/rejected": -301.362060546875, + "loss": 0.5764, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": 0.4226422905921936, + "rewards/margins": 1.428713321685791, + "rewards/rejected": -1.0060709714889526, + "step": 380 + }, + { + "epoch": 0.4, + "learning_rate": 4.810562571756601e-07, + "logits/chosen": -2.941606044769287, + "logits/rejected": -2.9353060722351074, + "logps/chosen": -364.6161193847656, + "logps/rejected": -285.13714599609375, + "loss": 0.5958, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.369827002286911, + "rewards/margins": 1.4904773235321045, + "rewards/rejected": -1.1206501722335815, + "step": 390 + }, + { + "epoch": 0.41, + "learning_rate": 4.791427477994642e-07, + "logits/chosen": -2.9758362770080566, + "logits/rejected": -2.9436521530151367, + "logps/chosen": -341.735107421875, + "logps/rejected": -297.5789489746094, + "loss": 0.5809, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.364463746547699, + "rewards/margins": 1.448970913887024, + "rewards/rejected": -1.0845072269439697, + "step": 400 + }, + { + "epoch": 0.41, + "eval_logits/chosen": -2.9359750747680664, + "eval_logits/rejected": -2.9205198287963867, + "eval_logps/chosen": -349.02520751953125, + "eval_logps/rejected": -293.040771484375, + "eval_loss": 0.5436112284660339, + "eval_rewards/accuracies": 0.7599999904632568, + "eval_rewards/chosen": 0.38406580686569214, + "eval_rewards/margins": 1.2762165069580078, + "eval_rewards/rejected": -0.89215087890625, + "eval_runtime": 499.6663, + "eval_samples_per_second": 4.003, + "eval_steps_per_second": 0.5, + "step": 400 + }, + { + "epoch": 0.42, + "learning_rate": 4.772292384232682e-07, + "logits/chosen": -2.952409029006958, + "logits/rejected": -2.9130635261535645, + "logps/chosen": -338.5810546875, + "logps/rejected": -306.2530517578125, + "loss": 0.746, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": -0.08780597150325775, + "rewards/margins": 0.7161486148834229, + "rewards/rejected": -0.8039544820785522, + "step": 410 + }, + { + "epoch": 0.43, + "learning_rate": 4.753157290470723e-07, + "logits/chosen": -2.995166301727295, + "logits/rejected": -2.9763875007629395, + "logps/chosen": -342.8363342285156, + "logps/rejected": -276.787353515625, + "loss": 0.561, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.18220452964305878, + "rewards/margins": 0.9985648989677429, + "rewards/rejected": -0.816360354423523, + "step": 420 + }, + { + "epoch": 0.44, + "learning_rate": 4.7340221967087635e-07, + "logits/chosen": -3.0134756565093994, + "logits/rejected": -3.0398764610290527, + "logps/chosen": -321.25738525390625, + "logps/rejected": -267.5379943847656, + "loss": 0.5616, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.4658970236778259, + "rewards/margins": 1.3995509147644043, + "rewards/rejected": -0.9336539506912231, + "step": 430 + }, + { + "epoch": 0.45, + "learning_rate": 4.714887102946804e-07, + "logits/chosen": -3.0466179847717285, + "logits/rejected": -3.0337159633636475, + "logps/chosen": -326.5643005371094, + "logps/rejected": -267.01751708984375, + "loss": 0.5292, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.3320990800857544, + "rewards/margins": 1.2292256355285645, + "rewards/rejected": -0.8971264958381653, + "step": 440 + }, + { + "epoch": 0.46, + "learning_rate": 4.6957520091848447e-07, + "logits/chosen": -3.0742642879486084, + "logits/rejected": -3.0417568683624268, + "logps/chosen": -298.84161376953125, + "logps/rejected": -281.374267578125, + "loss": 0.5816, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": 0.23377075791358948, + "rewards/margins": 0.9984762072563171, + "rewards/rejected": -0.76470547914505, + "step": 450 + }, + { + "epoch": 0.48, + "learning_rate": 4.6766169154228853e-07, + "logits/chosen": -3.01784348487854, + "logits/rejected": -3.042041540145874, + "logps/chosen": -331.4616394042969, + "logps/rejected": -285.1472473144531, + "loss": 0.557, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6183053851127625, + "rewards/margins": 1.2917571067810059, + "rewards/rejected": -0.6734516620635986, + "step": 460 + }, + { + "epoch": 0.49, + "learning_rate": 4.657481821660926e-07, + "logits/chosen": -3.0028042793273926, + "logits/rejected": -2.9753506183624268, + "logps/chosen": -312.3541564941406, + "logps/rejected": -250.565673828125, + "loss": 0.5482, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": 0.26564690470695496, + "rewards/margins": 1.1742582321166992, + "rewards/rejected": -0.9086114168167114, + "step": 470 + }, + { + "epoch": 0.5, + "learning_rate": 4.6383467278989666e-07, + "logits/chosen": -3.0283827781677246, + "logits/rejected": -3.046517848968506, + "logps/chosen": -315.6674499511719, + "logps/rejected": -280.6795349121094, + "loss": 0.5289, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.547839343547821, + "rewards/margins": 1.7104412317276, + "rewards/rejected": -1.1626019477844238, + "step": 480 + }, + { + "epoch": 0.51, + "learning_rate": 4.6192116341370067e-07, + "logits/chosen": -2.985366106033325, + "logits/rejected": -2.969849109649658, + "logps/chosen": -330.6661682128906, + "logps/rejected": -275.83099365234375, + "loss": 0.5242, + "rewards/accuracies": 0.793749988079071, + "rewards/chosen": 0.4285426139831543, + "rewards/margins": 1.5095882415771484, + "rewards/rejected": -1.0810457468032837, + "step": 490 + }, + { + "epoch": 0.52, + "learning_rate": 4.6000765403750473e-07, + "logits/chosen": -3.0527420043945312, + "logits/rejected": -3.0406336784362793, + "logps/chosen": -299.01055908203125, + "logps/rejected": -242.08358764648438, + "loss": 0.5164, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.38268548250198364, + "rewards/margins": 1.2666680812835693, + "rewards/rejected": -0.8839825391769409, + "step": 500 + }, + { + "epoch": 0.52, + "eval_logits/chosen": -3.015965223312378, + "eval_logits/rejected": -3.0117154121398926, + "eval_logps/chosen": -349.8603210449219, + "eval_logps/rejected": -295.5238952636719, + "eval_loss": 0.5405718088150024, + "eval_rewards/accuracies": 0.765999972820282, + "eval_rewards/chosen": 0.3005577325820923, + "eval_rewards/margins": 1.4410209655761719, + "eval_rewards/rejected": -1.1404632329940796, + "eval_runtime": 499.3309, + "eval_samples_per_second": 4.005, + "eval_steps_per_second": 0.501, + "step": 500 + }, + { + "epoch": 0.53, + "learning_rate": 4.580941446613088e-07, + "logits/chosen": -3.002946376800537, + "logits/rejected": -2.97822642326355, + "logps/chosen": -320.47601318359375, + "logps/rejected": -279.7250671386719, + "loss": 0.5376, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.21459856629371643, + "rewards/margins": 1.4735734462738037, + "rewards/rejected": -1.2589749097824097, + "step": 510 + }, + { + "epoch": 0.54, + "learning_rate": 4.5618063528511285e-07, + "logits/chosen": -3.031226634979248, + "logits/rejected": -3.0124049186706543, + "logps/chosen": -306.7613830566406, + "logps/rejected": -264.15411376953125, + "loss": 0.5679, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": 0.32867926359176636, + "rewards/margins": 1.4985711574554443, + "rewards/rejected": -1.1698918342590332, + "step": 520 + }, + { + "epoch": 0.55, + "learning_rate": 4.542671259089169e-07, + "logits/chosen": -2.965167284011841, + "logits/rejected": -2.938575267791748, + "logps/chosen": -300.1754455566406, + "logps/rejected": -275.5013427734375, + "loss": 0.5973, + "rewards/accuracies": 0.6937500238418579, + "rewards/chosen": -0.07723500579595566, + "rewards/margins": 0.9571349024772644, + "rewards/rejected": -1.034369945526123, + "step": 530 + }, + { + "epoch": 0.56, + "learning_rate": 4.52353616532721e-07, + "logits/chosen": -3.0364766120910645, + "logits/rejected": -3.031278610229492, + "logps/chosen": -347.5738220214844, + "logps/rejected": -286.0475769042969, + "loss": 0.559, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.6112526059150696, + "rewards/margins": 1.4097859859466553, + "rewards/rejected": -0.79853355884552, + "step": 540 + }, + { + "epoch": 0.57, + "learning_rate": 4.5044010715652504e-07, + "logits/chosen": -2.958436965942383, + "logits/rejected": -2.957434892654419, + "logps/chosen": -339.9021301269531, + "logps/rejected": -298.78729248046875, + "loss": 0.535, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.24418941140174866, + "rewards/margins": 1.1881563663482666, + "rewards/rejected": -0.9439669847488403, + "step": 550 + }, + { + "epoch": 0.58, + "learning_rate": 4.485265977803291e-07, + "logits/chosen": -2.9755353927612305, + "logits/rejected": -2.9734432697296143, + "logps/chosen": -338.18255615234375, + "logps/rejected": -294.8335876464844, + "loss": 0.5625, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.1414714902639389, + "rewards/margins": 1.3819502592086792, + "rewards/rejected": -1.240478754043579, + "step": 560 + }, + { + "epoch": 0.59, + "learning_rate": 4.4661308840413316e-07, + "logits/chosen": -2.8903489112854004, + "logits/rejected": -2.886591672897339, + "logps/chosen": -337.5680236816406, + "logps/rejected": -308.0293884277344, + "loss": 0.5135, + "rewards/accuracies": 0.7562500238418579, + "rewards/chosen": -0.011459055356681347, + "rewards/margins": 1.3413360118865967, + "rewards/rejected": -1.3527950048446655, + "step": 570 + }, + { + "epoch": 0.6, + "learning_rate": 4.446995790279372e-07, + "logits/chosen": -2.918245792388916, + "logits/rejected": -2.9179019927978516, + "logps/chosen": -331.99859619140625, + "logps/rejected": -315.8232421875, + "loss": 0.5671, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.17509713768959045, + "rewards/margins": 1.5391473770141602, + "rewards/rejected": -1.3640501499176025, + "step": 580 + }, + { + "epoch": 0.61, + "learning_rate": 4.4278606965174123e-07, + "logits/chosen": -2.947676420211792, + "logits/rejected": -2.9416377544403076, + "logps/chosen": -324.63861083984375, + "logps/rejected": -275.0785217285156, + "loss": 0.5666, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.015921253710985184, + "rewards/margins": 1.2579394578933716, + "rewards/rejected": -1.242018222808838, + "step": 590 + }, + { + "epoch": 0.62, + "learning_rate": 4.408725602755453e-07, + "logits/chosen": -2.9947569370269775, + "logits/rejected": -2.9961860179901123, + "logps/chosen": -284.56951904296875, + "logps/rejected": -242.5293426513672, + "loss": 0.5957, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.21141913533210754, + "rewards/margins": 0.8547714948654175, + "rewards/rejected": -1.066190481185913, + "step": 600 + }, + { + "epoch": 0.62, + "eval_logits/chosen": -2.9458935260772705, + "eval_logits/rejected": -2.935307025909424, + "eval_logps/chosen": -350.91400146484375, + "eval_logps/rejected": -294.41485595703125, + "eval_loss": 0.5336324572563171, + "eval_rewards/accuracies": 0.7379999756813049, + "eval_rewards/chosen": 0.1951846182346344, + "eval_rewards/margins": 1.224743366241455, + "eval_rewards/rejected": -1.0295586585998535, + "eval_runtime": 500.1685, + "eval_samples_per_second": 3.999, + "eval_steps_per_second": 0.5, + "step": 600 + }, + { + "epoch": 0.63, + "learning_rate": 4.3895905089934936e-07, + "logits/chosen": -2.939690113067627, + "logits/rejected": -2.9660871028900146, + "logps/chosen": -327.99853515625, + "logps/rejected": -304.2380065917969, + "loss": 0.5408, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": 0.16417217254638672, + "rewards/margins": 1.4821269512176514, + "rewards/rejected": -1.3179547786712646, + "step": 610 + }, + { + "epoch": 0.64, + "learning_rate": 4.370455415231534e-07, + "logits/chosen": -2.9834229946136475, + "logits/rejected": -2.985764980316162, + "logps/chosen": -375.3091125488281, + "logps/rejected": -344.99591064453125, + "loss": 0.5471, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.2328425645828247, + "rewards/margins": 1.4427080154418945, + "rewards/rejected": -1.2098654508590698, + "step": 620 + }, + { + "epoch": 0.65, + "learning_rate": 4.351320321469575e-07, + "logits/chosen": -2.9586803913116455, + "logits/rejected": -2.9774413108825684, + "logps/chosen": -365.9512023925781, + "logps/rejected": -294.7960205078125, + "loss": 0.6138, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.1919662207365036, + "rewards/margins": 1.4944833517074585, + "rewards/rejected": -1.3025171756744385, + "step": 630 + }, + { + "epoch": 0.66, + "learning_rate": 4.3321852277076154e-07, + "logits/chosen": -3.0369629859924316, + "logits/rejected": -3.0328054428100586, + "logps/chosen": -342.85235595703125, + "logps/rejected": -305.83343505859375, + "loss": 0.5368, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": -0.0010919570922851562, + "rewards/margins": 1.265241265296936, + "rewards/rejected": -1.2663332223892212, + "step": 640 + }, + { + "epoch": 0.67, + "learning_rate": 4.313050133945656e-07, + "logits/chosen": -3.0272622108459473, + "logits/rejected": -3.0403847694396973, + "logps/chosen": -333.6284484863281, + "logps/rejected": -284.5811462402344, + "loss": 0.5111, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": 0.32994264364242554, + "rewards/margins": 1.4819453954696655, + "rewards/rejected": -1.1520028114318848, + "step": 650 + }, + { + "epoch": 0.68, + "learning_rate": 4.2939150401836967e-07, + "logits/chosen": -3.0416343212127686, + "logits/rejected": -3.047837734222412, + "logps/chosen": -329.989990234375, + "logps/rejected": -305.66046142578125, + "loss": 0.5253, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.07132011651992798, + "rewards/margins": 1.4362289905548096, + "rewards/rejected": -1.3649089336395264, + "step": 660 + }, + { + "epoch": 0.69, + "learning_rate": 4.2747799464217373e-07, + "logits/chosen": -2.998131036758423, + "logits/rejected": -2.992091417312622, + "logps/chosen": -378.5470275878906, + "logps/rejected": -282.40570068359375, + "loss": 0.5477, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.08499707281589508, + "rewards/margins": 1.1583983898162842, + "rewards/rejected": -1.0734012126922607, + "step": 670 + }, + { + "epoch": 0.7, + "learning_rate": 4.255644852659778e-07, + "logits/chosen": -2.9488022327423096, + "logits/rejected": -2.993544816970825, + "logps/chosen": -360.3157653808594, + "logps/rejected": -310.53826904296875, + "loss": 0.6413, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": 0.2993057370185852, + "rewards/margins": 1.025062918663025, + "rewards/rejected": -0.7257571220397949, + "step": 680 + }, + { + "epoch": 0.71, + "learning_rate": 4.236509758897818e-07, + "logits/chosen": -2.9860472679138184, + "logits/rejected": -3.0196166038513184, + "logps/chosen": -314.6225280761719, + "logps/rejected": -263.229248046875, + "loss": 0.5575, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": 0.18419621884822845, + "rewards/margins": 1.147241473197937, + "rewards/rejected": -0.9630452990531921, + "step": 690 + }, + { + "epoch": 0.72, + "learning_rate": 4.2173746651358586e-07, + "logits/chosen": -2.97763729095459, + "logits/rejected": -2.972525119781494, + "logps/chosen": -351.1864013671875, + "logps/rejected": -286.4507751464844, + "loss": 0.6516, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": 1.0068714800581802e-05, + "rewards/margins": 0.9209893345832825, + "rewards/rejected": -0.9209792017936707, + "step": 700 + }, + { + "epoch": 0.72, + "eval_logits/chosen": -2.985739231109619, + "eval_logits/rejected": -2.9889140129089355, + "eval_logps/chosen": -350.6415710449219, + "eval_logps/rejected": -294.66180419921875, + "eval_loss": 0.5292132496833801, + "eval_rewards/accuracies": 0.7519999742507935, + "eval_rewards/chosen": 0.22242723405361176, + "eval_rewards/margins": 1.2766809463500977, + "eval_rewards/rejected": -1.0542538166046143, + "eval_runtime": 499.164, + "eval_samples_per_second": 4.007, + "eval_steps_per_second": 0.501, + "step": 700 + }, + { + "epoch": 0.73, + "learning_rate": 4.198239571373899e-07, + "logits/chosen": -2.9942946434020996, + "logits/rejected": -2.9884631633758545, + "logps/chosen": -320.36328125, + "logps/rejected": -278.8677062988281, + "loss": 0.6297, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.20690801739692688, + "rewards/margins": 1.0582275390625, + "rewards/rejected": -0.8513194918632507, + "step": 710 + }, + { + "epoch": 0.74, + "learning_rate": 4.17910447761194e-07, + "logits/chosen": -2.9798600673675537, + "logits/rejected": -2.9808664321899414, + "logps/chosen": -346.0234680175781, + "logps/rejected": -326.8348083496094, + "loss": 0.6079, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.026054318994283676, + "rewards/margins": 1.1562727689743042, + "rewards/rejected": -1.1302186250686646, + "step": 720 + }, + { + "epoch": 0.75, + "learning_rate": 4.1599693838499805e-07, + "logits/chosen": -2.9801442623138428, + "logits/rejected": -2.9823849201202393, + "logps/chosen": -331.5793151855469, + "logps/rejected": -271.0641174316406, + "loss": 0.569, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": 0.46656933426856995, + "rewards/margins": 1.4055955410003662, + "rewards/rejected": -0.9390263557434082, + "step": 730 + }, + { + "epoch": 0.76, + "learning_rate": 4.140834290088021e-07, + "logits/chosen": -2.8921711444854736, + "logits/rejected": -2.893559694290161, + "logps/chosen": -353.5256652832031, + "logps/rejected": -315.2562561035156, + "loss": 0.5244, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.3240983784198761, + "rewards/margins": 1.932403326034546, + "rewards/rejected": -1.6083049774169922, + "step": 740 + }, + { + "epoch": 0.77, + "learning_rate": 4.121699196326062e-07, + "logits/chosen": -2.909632444381714, + "logits/rejected": -2.9216995239257812, + "logps/chosen": -324.0218200683594, + "logps/rejected": -261.0860900878906, + "loss": 0.515, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": 0.3732963800430298, + "rewards/margins": 1.5582427978515625, + "rewards/rejected": -1.1849465370178223, + "step": 750 + }, + { + "epoch": 0.78, + "learning_rate": 4.1025641025641024e-07, + "logits/chosen": -2.954899311065674, + "logits/rejected": -2.949449062347412, + "logps/chosen": -320.23431396484375, + "logps/rejected": -323.3485412597656, + "loss": 0.5455, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": 0.34602198004722595, + "rewards/margins": 1.3542587757110596, + "rewards/rejected": -1.0082366466522217, + "step": 760 + }, + { + "epoch": 0.8, + "learning_rate": 4.083429008802143e-07, + "logits/chosen": -2.9397459030151367, + "logits/rejected": -2.958151340484619, + "logps/chosen": -340.77081298828125, + "logps/rejected": -300.03826904296875, + "loss": 0.5168, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.03096119686961174, + "rewards/margins": 1.257548213005066, + "rewards/rejected": -1.226586937904358, + "step": 770 + }, + { + "epoch": 0.81, + "learning_rate": 4.0642939150401836e-07, + "logits/chosen": -2.9671478271484375, + "logits/rejected": -2.9739999771118164, + "logps/chosen": -326.27630615234375, + "logps/rejected": -293.6321716308594, + "loss": 0.5292, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": 0.1696806699037552, + "rewards/margins": 1.2981321811676025, + "rewards/rejected": -1.1284515857696533, + "step": 780 + }, + { + "epoch": 0.82, + "learning_rate": 4.0451588212782237e-07, + "logits/chosen": -2.975926637649536, + "logits/rejected": -2.988398551940918, + "logps/chosen": -306.8700256347656, + "logps/rejected": -270.39520263671875, + "loss": 0.5052, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.08689726889133453, + "rewards/margins": 1.731827735900879, + "rewards/rejected": -1.6449304819107056, + "step": 790 + }, + { + "epoch": 0.83, + "learning_rate": 4.0260237275162643e-07, + "logits/chosen": -2.912811279296875, + "logits/rejected": -2.917335271835327, + "logps/chosen": -326.623291015625, + "logps/rejected": -297.1636047363281, + "loss": 0.5353, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": 0.33763498067855835, + "rewards/margins": 1.476910948753357, + "rewards/rejected": -1.1392759084701538, + "step": 800 + }, + { + "epoch": 0.83, + "eval_logits/chosen": -2.9171199798583984, + "eval_logits/rejected": -2.9115679264068604, + "eval_logps/chosen": -351.76287841796875, + "eval_logps/rejected": -298.2225341796875, + "eval_loss": 0.5144525766372681, + "eval_rewards/accuracies": 0.7559999823570251, + "eval_rewards/chosen": 0.11029549688100815, + "eval_rewards/margins": 1.5206220149993896, + "eval_rewards/rejected": -1.4103264808654785, + "eval_runtime": 500.3685, + "eval_samples_per_second": 3.997, + "eval_steps_per_second": 0.5, + "step": 800 + }, + { + "epoch": 0.84, + "learning_rate": 4.006888633754305e-07, + "logits/chosen": -2.9584765434265137, + "logits/rejected": -2.9493420124053955, + "logps/chosen": -344.96832275390625, + "logps/rejected": -291.515380859375, + "loss": 0.4904, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.19502875208854675, + "rewards/margins": 1.6929543018341064, + "rewards/rejected": -1.4979256391525269, + "step": 810 + }, + { + "epoch": 0.85, + "learning_rate": 3.9877535399923456e-07, + "logits/chosen": -2.965297222137451, + "logits/rejected": -2.9511642456054688, + "logps/chosen": -325.5323181152344, + "logps/rejected": -296.2393493652344, + "loss": 0.5431, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": -0.10147881507873535, + "rewards/margins": 1.3891279697418213, + "rewards/rejected": -1.490606665611267, + "step": 820 + }, + { + "epoch": 0.86, + "learning_rate": 3.968618446230386e-07, + "logits/chosen": -2.9831089973449707, + "logits/rejected": -2.988027572631836, + "logps/chosen": -365.47564697265625, + "logps/rejected": -278.7037048339844, + "loss": 0.556, + "rewards/accuracies": 0.831250011920929, + "rewards/chosen": 0.29974445700645447, + "rewards/margins": 1.7932666540145874, + "rewards/rejected": -1.4935224056243896, + "step": 830 + }, + { + "epoch": 0.87, + "learning_rate": 3.949483352468427e-07, + "logits/chosen": -2.9965195655822754, + "logits/rejected": -3.0119361877441406, + "logps/chosen": -310.4830627441406, + "logps/rejected": -278.9137878417969, + "loss": 0.5699, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": -0.004600034561008215, + "rewards/margins": 1.4109219312667847, + "rewards/rejected": -1.4155219793319702, + "step": 840 + }, + { + "epoch": 0.88, + "learning_rate": 3.9303482587064674e-07, + "logits/chosen": -3.0153908729553223, + "logits/rejected": -3.0218112468719482, + "logps/chosen": -369.01336669921875, + "logps/rejected": -302.21661376953125, + "loss": 0.5626, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.07364175468683243, + "rewards/margins": 1.458538293838501, + "rewards/rejected": -1.3848967552185059, + "step": 850 + }, + { + "epoch": 0.89, + "learning_rate": 3.911213164944508e-07, + "logits/chosen": -2.9755771160125732, + "logits/rejected": -2.9671168327331543, + "logps/chosen": -360.69061279296875, + "logps/rejected": -291.0027160644531, + "loss": 0.5702, + "rewards/accuracies": 0.706250011920929, + "rewards/chosen": -0.19498476386070251, + "rewards/margins": 1.1925767660140991, + "rewards/rejected": -1.3875614404678345, + "step": 860 + }, + { + "epoch": 0.9, + "learning_rate": 3.8920780711825487e-07, + "logits/chosen": -3.0124592781066895, + "logits/rejected": -3.0058891773223877, + "logps/chosen": -362.83624267578125, + "logps/rejected": -305.2490539550781, + "loss": 0.5918, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.20124521851539612, + "rewards/margins": 1.2762418985366821, + "rewards/rejected": -1.0749967098236084, + "step": 870 + }, + { + "epoch": 0.91, + "learning_rate": 3.8729429774205893e-07, + "logits/chosen": -2.963660478591919, + "logits/rejected": -3.0102293491363525, + "logps/chosen": -366.26568603515625, + "logps/rejected": -289.0752258300781, + "loss": 0.5749, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.3400164246559143, + "rewards/margins": 1.4809852838516235, + "rewards/rejected": -1.140968918800354, + "step": 880 + }, + { + "epoch": 0.92, + "learning_rate": 3.8538078836586294e-07, + "logits/chosen": -2.9732441902160645, + "logits/rejected": -2.97709321975708, + "logps/chosen": -321.77056884765625, + "logps/rejected": -294.64068603515625, + "loss": 0.5648, + "rewards/accuracies": 0.7437499761581421, + "rewards/chosen": 0.07467867434024811, + "rewards/margins": 1.4241504669189453, + "rewards/rejected": -1.3494718074798584, + "step": 890 + }, + { + "epoch": 0.93, + "learning_rate": 3.83467278989667e-07, + "logits/chosen": -2.9654107093811035, + "logits/rejected": -2.946199417114258, + "logps/chosen": -333.27496337890625, + "logps/rejected": -261.41510009765625, + "loss": 0.5293, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.17271609604358673, + "rewards/margins": 1.3218122720718384, + "rewards/rejected": -1.149096131324768, + "step": 900 + }, + { + "epoch": 0.93, + "eval_logits/chosen": -2.953744649887085, + "eval_logits/rejected": -2.946305990219116, + "eval_logps/chosen": -351.4819030761719, + "eval_logps/rejected": -297.2767639160156, + "eval_loss": 0.5146499872207642, + "eval_rewards/accuracies": 0.7620000243186951, + "eval_rewards/chosen": 0.13839714229106903, + "eval_rewards/margins": 1.4541445970535278, + "eval_rewards/rejected": -1.3157474994659424, + "eval_runtime": 500.2192, + "eval_samples_per_second": 3.998, + "eval_steps_per_second": 0.5, + "step": 900 + }, + { + "epoch": 0.94, + "learning_rate": 3.8155376961347106e-07, + "logits/chosen": -2.9613089561462402, + "logits/rejected": -2.9708240032196045, + "logps/chosen": -272.4183044433594, + "logps/rejected": -238.2782440185547, + "loss": 0.5416, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.04612383991479874, + "rewards/margins": 1.3310301303863525, + "rewards/rejected": -1.2849063873291016, + "step": 910 + }, + { + "epoch": 0.95, + "learning_rate": 3.796402602372751e-07, + "logits/chosen": -2.9403462409973145, + "logits/rejected": -2.9374024868011475, + "logps/chosen": -331.61199951171875, + "logps/rejected": -297.79296875, + "loss": 0.5002, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.15933088958263397, + "rewards/margins": 1.321645975112915, + "rewards/rejected": -1.162315011024475, + "step": 920 + }, + { + "epoch": 0.96, + "learning_rate": 3.777267508610792e-07, + "logits/chosen": -2.8792290687561035, + "logits/rejected": -2.904510498046875, + "logps/chosen": -337.1361389160156, + "logps/rejected": -294.4916687011719, + "loss": 0.5735, + "rewards/accuracies": 0.768750011920929, + "rewards/chosen": -0.10670886188745499, + "rewards/margins": 1.252929449081421, + "rewards/rejected": -1.3596383333206177, + "step": 930 + }, + { + "epoch": 0.97, + "learning_rate": 3.7581324148488325e-07, + "logits/chosen": -2.9526283740997314, + "logits/rejected": -2.936981201171875, + "logps/chosen": -384.3909606933594, + "logps/rejected": -281.7669677734375, + "loss": 0.5667, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.11869187653064728, + "rewards/margins": 1.0847665071487427, + "rewards/rejected": -1.203458309173584, + "step": 940 + }, + { + "epoch": 0.98, + "learning_rate": 3.738997321086873e-07, + "logits/chosen": -2.9670538902282715, + "logits/rejected": -2.9704997539520264, + "logps/chosen": -334.4848937988281, + "logps/rejected": -277.0538330078125, + "loss": 0.5199, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.020179124549031258, + "rewards/margins": 1.3067853450775146, + "rewards/rejected": -1.2866061925888062, + "step": 950 + }, + { + "epoch": 0.99, + "learning_rate": 3.7198622273249137e-07, + "logits/chosen": -2.955815076828003, + "logits/rejected": -2.9407596588134766, + "logps/chosen": -342.2349548339844, + "logps/rejected": -290.681396484375, + "loss": 0.5393, + "rewards/accuracies": 0.731249988079071, + "rewards/chosen": 0.006191739346832037, + "rewards/margins": 1.012204885482788, + "rewards/rejected": -1.0060131549835205, + "step": 960 + }, + { + "epoch": 1.0, + "learning_rate": 3.7007271335629544e-07, + "logits/chosen": -2.9522531032562256, + "logits/rejected": -2.938239097595215, + "logps/chosen": -312.70745849609375, + "logps/rejected": -283.48223876953125, + "loss": 0.4885, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.21793344616889954, + "rewards/margins": 1.9288184642791748, + "rewards/rejected": -1.7108850479125977, + "step": 970 + }, + { + "epoch": 1.01, + "learning_rate": 3.681592039800995e-07, + "logits/chosen": -2.943690776824951, + "logits/rejected": -2.9434661865234375, + "logps/chosen": -276.56475830078125, + "logps/rejected": -279.9871826171875, + "loss": 0.1281, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.4070587158203125, + "rewards/margins": 4.2252960205078125, + "rewards/rejected": -2.818237543106079, + "step": 980 + }, + { + "epoch": 1.02, + "learning_rate": 3.662456946039035e-07, + "logits/chosen": -2.945516586303711, + "logits/rejected": -2.937763214111328, + "logps/chosen": -294.52215576171875, + "logps/rejected": -295.2162170410156, + "loss": 0.1193, + "rewards/accuracies": 0.918749988079071, + "rewards/chosen": 1.4316601753234863, + "rewards/margins": 4.2783918380737305, + "rewards/rejected": -2.846731185913086, + "step": 990 + }, + { + "epoch": 1.03, + "learning_rate": 3.6433218522770757e-07, + "logits/chosen": -2.846494436264038, + "logits/rejected": -2.858985424041748, + "logps/chosen": -310.45452880859375, + "logps/rejected": -307.7423400878906, + "loss": 0.1078, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 1.7381668090820312, + "rewards/margins": 4.478921890258789, + "rewards/rejected": -2.740755081176758, + "step": 1000 + }, + { + "epoch": 1.03, + "eval_logits/chosen": -2.9057369232177734, + "eval_logits/rejected": -2.8941798210144043, + "eval_logps/chosen": -349.8238220214844, + "eval_logps/rejected": -300.6510314941406, + "eval_loss": 0.5121060609817505, + "eval_rewards/accuracies": 0.777999997138977, + "eval_rewards/chosen": 0.30420342087745667, + "eval_rewards/margins": 1.9573808908462524, + "eval_rewards/rejected": -1.6531774997711182, + "eval_runtime": 499.8834, + "eval_samples_per_second": 4.001, + "eval_steps_per_second": 0.5, + "step": 1000 + }, + { + "epoch": 1.04, + "learning_rate": 3.6241867585151163e-07, + "logits/chosen": -2.9094786643981934, + "logits/rejected": -2.919285535812378, + "logps/chosen": -305.79949951171875, + "logps/rejected": -297.85455322265625, + "loss": 0.108, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.7003862857818604, + "rewards/margins": 4.974067687988281, + "rewards/rejected": -3.273681640625, + "step": 1010 + }, + { + "epoch": 1.05, + "learning_rate": 3.605051664753157e-07, + "logits/chosen": -2.880923271179199, + "logits/rejected": -2.887871742248535, + "logps/chosen": -312.16107177734375, + "logps/rejected": -289.467041015625, + "loss": 0.1136, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 1.358328104019165, + "rewards/margins": 4.708084583282471, + "rewards/rejected": -3.3497557640075684, + "step": 1020 + }, + { + "epoch": 1.06, + "learning_rate": 3.5859165709911975e-07, + "logits/chosen": -2.883192539215088, + "logits/rejected": -2.8778469562530518, + "logps/chosen": -327.26788330078125, + "logps/rejected": -333.3467712402344, + "loss": 0.0861, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.5257574319839478, + "rewards/margins": 4.850948333740234, + "rewards/rejected": -3.325191020965576, + "step": 1030 + }, + { + "epoch": 1.07, + "learning_rate": 3.566781477229238e-07, + "logits/chosen": -2.946524143218994, + "logits/rejected": -2.9164326190948486, + "logps/chosen": -303.1543884277344, + "logps/rejected": -342.10772705078125, + "loss": 0.1002, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.4854071140289307, + "rewards/margins": 5.697620391845703, + "rewards/rejected": -4.212213516235352, + "step": 1040 + }, + { + "epoch": 1.08, + "learning_rate": 3.547646383467279e-07, + "logits/chosen": -2.891878604888916, + "logits/rejected": -2.8879518508911133, + "logps/chosen": -317.4754943847656, + "logps/rejected": -299.55224609375, + "loss": 0.1082, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 1.3902817964553833, + "rewards/margins": 5.116191387176514, + "rewards/rejected": -3.725910186767578, + "step": 1050 + }, + { + "epoch": 1.09, + "learning_rate": 3.5285112897053194e-07, + "logits/chosen": -2.9003710746765137, + "logits/rejected": -2.902200222015381, + "logps/chosen": -315.47528076171875, + "logps/rejected": -322.63446044921875, + "loss": 0.0864, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.4996274709701538, + "rewards/margins": 5.4629316329956055, + "rewards/rejected": -3.963304042816162, + "step": 1060 + }, + { + "epoch": 1.11, + "learning_rate": 3.50937619594336e-07, + "logits/chosen": -2.9368162155151367, + "logits/rejected": -2.9310317039489746, + "logps/chosen": -323.0782165527344, + "logps/rejected": -339.6298828125, + "loss": 0.113, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.027733564376831, + "rewards/margins": 4.9948883056640625, + "rewards/rejected": -3.9671554565429688, + "step": 1070 + }, + { + "epoch": 1.12, + "learning_rate": 3.4902411021814007e-07, + "logits/chosen": -2.9368138313293457, + "logits/rejected": -2.9042770862579346, + "logps/chosen": -353.8468017578125, + "logps/rejected": -307.30560302734375, + "loss": 0.1081, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.38088858127594, + "rewards/margins": 5.396637439727783, + "rewards/rejected": -4.015748500823975, + "step": 1080 + }, + { + "epoch": 1.13, + "learning_rate": 3.4711060084194413e-07, + "logits/chosen": -2.900344133377075, + "logits/rejected": -2.9066405296325684, + "logps/chosen": -332.6580505371094, + "logps/rejected": -306.313232421875, + "loss": 0.1185, + "rewards/accuracies": 0.9375, + "rewards/chosen": 1.6640781164169312, + "rewards/margins": 4.935359477996826, + "rewards/rejected": -3.2712814807891846, + "step": 1090 + }, + { + "epoch": 1.14, + "learning_rate": 3.4519709146574814e-07, + "logits/chosen": -2.927727460861206, + "logits/rejected": -2.8953537940979004, + "logps/chosen": -267.3210144042969, + "logps/rejected": -272.5216979980469, + "loss": 0.0928, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.3533536195755005, + "rewards/margins": 5.032728672027588, + "rewards/rejected": -3.6793746948242188, + "step": 1100 + }, + { + "epoch": 1.14, + "eval_logits/chosen": -2.90618896484375, + "eval_logits/rejected": -2.891117811203003, + "eval_logps/chosen": -352.6541442871094, + "eval_logps/rejected": -305.599853515625, + "eval_loss": 0.5322153568267822, + "eval_rewards/accuracies": 0.777999997138977, + "eval_rewards/chosen": 0.021169064566493034, + "eval_rewards/margins": 2.169229030609131, + "eval_rewards/rejected": -2.148059844970703, + "eval_runtime": 500.2131, + "eval_samples_per_second": 3.998, + "eval_steps_per_second": 0.5, + "step": 1100 + }, + { + "epoch": 1.15, + "learning_rate": 3.432835820895522e-07, + "logits/chosen": -2.8970720767974854, + "logits/rejected": -2.8721566200256348, + "logps/chosen": -315.86383056640625, + "logps/rejected": -312.5824890136719, + "loss": 0.0952, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.2646430730819702, + "rewards/margins": 5.519986152648926, + "rewards/rejected": -4.255342483520508, + "step": 1110 + }, + { + "epoch": 1.16, + "learning_rate": 3.4137007271335626e-07, + "logits/chosen": -2.8670730590820312, + "logits/rejected": -2.8753676414489746, + "logps/chosen": -281.70355224609375, + "logps/rejected": -292.56365966796875, + "loss": 0.1105, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.9823983907699585, + "rewards/margins": 4.9709882736206055, + "rewards/rejected": -3.9885895252227783, + "step": 1120 + }, + { + "epoch": 1.17, + "learning_rate": 3.394565633371603e-07, + "logits/chosen": -2.9044623374938965, + "logits/rejected": -2.8968944549560547, + "logps/chosen": -370.77630615234375, + "logps/rejected": -357.85198974609375, + "loss": 0.1185, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 2.036125659942627, + "rewards/margins": 6.214476108551025, + "rewards/rejected": -4.17834997177124, + "step": 1130 + }, + { + "epoch": 1.18, + "learning_rate": 3.375430539609644e-07, + "logits/chosen": -2.8854241371154785, + "logits/rejected": -2.897651433944702, + "logps/chosen": -290.554443359375, + "logps/rejected": -339.0045166015625, + "loss": 0.0878, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.8847514390945435, + "rewards/margins": 5.345077991485596, + "rewards/rejected": -4.460326671600342, + "step": 1140 + }, + { + "epoch": 1.19, + "learning_rate": 3.3562954458476845e-07, + "logits/chosen": -2.916201114654541, + "logits/rejected": -2.897951364517212, + "logps/chosen": -355.0877380371094, + "logps/rejected": -328.5706481933594, + "loss": 0.0898, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.6507154703140259, + "rewards/margins": 6.26724910736084, + "rewards/rejected": -4.616534233093262, + "step": 1150 + }, + { + "epoch": 1.2, + "learning_rate": 3.337160352085725e-07, + "logits/chosen": -2.8670494556427, + "logits/rejected": -2.904618740081787, + "logps/chosen": -328.02496337890625, + "logps/rejected": -328.0072326660156, + "loss": 0.0763, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 1.1252260208129883, + "rewards/margins": 5.925239562988281, + "rewards/rejected": -4.800013542175293, + "step": 1160 + }, + { + "epoch": 1.21, + "learning_rate": 3.3180252583237657e-07, + "logits/chosen": -2.9289798736572266, + "logits/rejected": -2.929511547088623, + "logps/chosen": -325.98822021484375, + "logps/rejected": -297.0420837402344, + "loss": 0.0937, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.4292241334915161, + "rewards/margins": 6.024569988250732, + "rewards/rejected": -4.595345973968506, + "step": 1170 + }, + { + "epoch": 1.22, + "learning_rate": 3.2988901645618063e-07, + "logits/chosen": -2.879714012145996, + "logits/rejected": -2.850334644317627, + "logps/chosen": -335.61297607421875, + "logps/rejected": -316.1272888183594, + "loss": 0.0961, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.421682357788086, + "rewards/margins": 5.595412254333496, + "rewards/rejected": -4.173730373382568, + "step": 1180 + }, + { + "epoch": 1.23, + "learning_rate": 3.279755070799847e-07, + "logits/chosen": -2.9137988090515137, + "logits/rejected": -2.921215057373047, + "logps/chosen": -299.4486999511719, + "logps/rejected": -330.41265869140625, + "loss": 0.0991, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.5697805881500244, + "rewards/margins": 5.397409439086914, + "rewards/rejected": -3.8276290893554688, + "step": 1190 + }, + { + "epoch": 1.24, + "learning_rate": 3.260619977037887e-07, + "logits/chosen": -2.8730692863464355, + "logits/rejected": -2.890625476837158, + "logps/chosen": -322.72906494140625, + "logps/rejected": -322.37774658203125, + "loss": 0.1295, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.4574929475784302, + "rewards/margins": 5.438672065734863, + "rewards/rejected": -3.9811782836914062, + "step": 1200 + }, + { + "epoch": 1.24, + "eval_logits/chosen": -2.9265103340148926, + "eval_logits/rejected": -2.919196844100952, + "eval_logps/chosen": -354.89837646484375, + "eval_logps/rejected": -307.5298156738281, + "eval_loss": 0.5383667945861816, + "eval_rewards/accuracies": 0.7760000228881836, + "eval_rewards/chosen": -0.20325522124767303, + "eval_rewards/margins": 2.137800693511963, + "eval_rewards/rejected": -2.3410558700561523, + "eval_runtime": 500.2164, + "eval_samples_per_second": 3.998, + "eval_steps_per_second": 0.5, + "step": 1200 + }, + { + "epoch": 1.25, + "learning_rate": 3.2414848832759277e-07, + "logits/chosen": -2.9244723320007324, + "logits/rejected": -2.8997814655303955, + "logps/chosen": -300.6869201660156, + "logps/rejected": -309.40478515625, + "loss": 0.0912, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.3476300239562988, + "rewards/margins": 5.487320899963379, + "rewards/rejected": -4.139689922332764, + "step": 1210 + }, + { + "epoch": 1.26, + "learning_rate": 3.2223497895139683e-07, + "logits/chosen": -2.9218931198120117, + "logits/rejected": -2.923119068145752, + "logps/chosen": -338.5274353027344, + "logps/rejected": -345.64019775390625, + "loss": 0.1366, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.7827097177505493, + "rewards/margins": 5.954866409301758, + "rewards/rejected": -4.172156810760498, + "step": 1220 + }, + { + "epoch": 1.27, + "learning_rate": 3.203214695752009e-07, + "logits/chosen": -2.9061856269836426, + "logits/rejected": -2.9157071113586426, + "logps/chosen": -322.0141296386719, + "logps/rejected": -351.03692626953125, + "loss": 0.1512, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.399315595626831, + "rewards/margins": 5.718806266784668, + "rewards/rejected": -4.319490909576416, + "step": 1230 + }, + { + "epoch": 1.28, + "learning_rate": 3.1840796019900495e-07, + "logits/chosen": -2.8873629570007324, + "logits/rejected": -2.9147911071777344, + "logps/chosen": -346.08380126953125, + "logps/rejected": -356.04461669921875, + "loss": 0.1031, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.514539122581482, + "rewards/margins": 5.836313724517822, + "rewards/rejected": -4.321774482727051, + "step": 1240 + }, + { + "epoch": 1.29, + "learning_rate": 3.16494450822809e-07, + "logits/chosen": -2.8592748641967773, + "logits/rejected": -2.854814052581787, + "logps/chosen": -294.95758056640625, + "logps/rejected": -280.81866455078125, + "loss": 0.0988, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.9549561738967896, + "rewards/margins": 4.784027099609375, + "rewards/rejected": -3.829070568084717, + "step": 1250 + }, + { + "epoch": 1.3, + "learning_rate": 3.145809414466131e-07, + "logits/chosen": -2.8863461017608643, + "logits/rejected": -2.870108127593994, + "logps/chosen": -314.99798583984375, + "logps/rejected": -317.19476318359375, + "loss": 0.0949, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 1.104218602180481, + "rewards/margins": 5.444802284240723, + "rewards/rejected": -4.340583801269531, + "step": 1260 + }, + { + "epoch": 1.31, + "learning_rate": 3.1266743207041714e-07, + "logits/chosen": -2.835925817489624, + "logits/rejected": -2.8321166038513184, + "logps/chosen": -360.5287170410156, + "logps/rejected": -305.8837890625, + "loss": 0.1046, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.5260238647460938, + "rewards/margins": 5.6372551918029785, + "rewards/rejected": -4.111231327056885, + "step": 1270 + }, + { + "epoch": 1.32, + "learning_rate": 3.107539226942212e-07, + "logits/chosen": -2.8840396404266357, + "logits/rejected": -2.902611017227173, + "logps/chosen": -306.69293212890625, + "logps/rejected": -348.4713439941406, + "loss": 0.1141, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.5585787296295166, + "rewards/margins": 6.097206115722656, + "rewards/rejected": -4.5386271476745605, + "step": 1280 + }, + { + "epoch": 1.33, + "learning_rate": 3.0884041331802526e-07, + "logits/chosen": -2.8800981044769287, + "logits/rejected": -2.886286973953247, + "logps/chosen": -299.43328857421875, + "logps/rejected": -306.6632385253906, + "loss": 0.1164, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.0270495414733887, + "rewards/margins": 5.2709574699401855, + "rewards/rejected": -4.243908405303955, + "step": 1290 + }, + { + "epoch": 1.34, + "learning_rate": 3.0692690394182927e-07, + "logits/chosen": -2.9457216262817383, + "logits/rejected": -2.9319682121276855, + "logps/chosen": -326.23724365234375, + "logps/rejected": -328.8221130371094, + "loss": 0.1093, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 1.3889268636703491, + "rewards/margins": 5.26351261138916, + "rewards/rejected": -3.874584913253784, + "step": 1300 + }, + { + "epoch": 1.34, + "eval_logits/chosen": -2.880734920501709, + "eval_logits/rejected": -2.870875597000122, + "eval_logps/chosen": -355.20660400390625, + "eval_logps/rejected": -309.2440490722656, + "eval_loss": 0.5469160079956055, + "eval_rewards/accuracies": 0.7860000133514404, + "eval_rewards/chosen": -0.23407350480556488, + "eval_rewards/margins": 2.2784061431884766, + "eval_rewards/rejected": -2.512479782104492, + "eval_runtime": 499.7502, + "eval_samples_per_second": 4.002, + "eval_steps_per_second": 0.5, + "step": 1300 + }, + { + "epoch": 1.35, + "learning_rate": 3.0501339456563334e-07, + "logits/chosen": -2.8287034034729004, + "logits/rejected": -2.807685613632202, + "logps/chosen": -320.2387390136719, + "logps/rejected": -309.42572021484375, + "loss": 0.1098, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.3659682273864746, + "rewards/margins": 5.727939128875732, + "rewards/rejected": -4.3619704246521, + "step": 1310 + }, + { + "epoch": 1.36, + "learning_rate": 3.030998851894374e-07, + "logits/chosen": -2.8757596015930176, + "logits/rejected": -2.8729655742645264, + "logps/chosen": -276.88128662109375, + "logps/rejected": -288.6640625, + "loss": 0.1207, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.6556949615478516, + "rewards/margins": 4.997775077819824, + "rewards/rejected": -4.342080116271973, + "step": 1320 + }, + { + "epoch": 1.37, + "learning_rate": 3.0118637581324146e-07, + "logits/chosen": -2.8695642948150635, + "logits/rejected": -2.8470568656921387, + "logps/chosen": -314.7218017578125, + "logps/rejected": -314.4281921386719, + "loss": 0.1229, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.0547069311141968, + "rewards/margins": 6.196503162384033, + "rewards/rejected": -5.141795635223389, + "step": 1330 + }, + { + "epoch": 1.38, + "learning_rate": 2.992728664370455e-07, + "logits/chosen": -2.832125186920166, + "logits/rejected": -2.862847328186035, + "logps/chosen": -275.20269775390625, + "logps/rejected": -299.24664306640625, + "loss": 0.1286, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 0.4705425202846527, + "rewards/margins": 5.204878807067871, + "rewards/rejected": -4.734335899353027, + "step": 1340 + }, + { + "epoch": 1.39, + "learning_rate": 2.973593570608496e-07, + "logits/chosen": -2.8754680156707764, + "logits/rejected": -2.8437042236328125, + "logps/chosen": -312.1493835449219, + "logps/rejected": -306.14056396484375, + "loss": 0.1128, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 0.5046719908714294, + "rewards/margins": 4.981934070587158, + "rewards/rejected": -4.477262496948242, + "step": 1350 + }, + { + "epoch": 1.4, + "learning_rate": 2.9544584768465365e-07, + "logits/chosen": -2.907262086868286, + "logits/rejected": -2.8830528259277344, + "logps/chosen": -334.17474365234375, + "logps/rejected": -328.50006103515625, + "loss": 0.1181, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.6233978271484375, + "rewards/margins": 4.791070461273193, + "rewards/rejected": -4.167672634124756, + "step": 1360 + }, + { + "epoch": 1.41, + "learning_rate": 2.935323383084577e-07, + "logits/chosen": -2.8675036430358887, + "logits/rejected": -2.826355457305908, + "logps/chosen": -351.460205078125, + "logps/rejected": -333.004150390625, + "loss": 0.1086, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.3194541931152344, + "rewards/margins": 6.369981288909912, + "rewards/rejected": -5.050527095794678, + "step": 1370 + }, + { + "epoch": 1.43, + "learning_rate": 2.9161882893226177e-07, + "logits/chosen": -2.929363250732422, + "logits/rejected": -2.8654415607452393, + "logps/chosen": -325.78826904296875, + "logps/rejected": -314.6927490234375, + "loss": 0.1175, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.906810462474823, + "rewards/margins": 5.147686004638672, + "rewards/rejected": -4.240875720977783, + "step": 1380 + }, + { + "epoch": 1.44, + "learning_rate": 2.8970531955606583e-07, + "logits/chosen": -2.9362032413482666, + "logits/rejected": -2.911289930343628, + "logps/chosen": -345.9186706542969, + "logps/rejected": -316.5303955078125, + "loss": 0.1283, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.9989246129989624, + "rewards/margins": 5.8463263511657715, + "rewards/rejected": -4.847402095794678, + "step": 1390 + }, + { + "epoch": 1.45, + "learning_rate": 2.8779181017986984e-07, + "logits/chosen": -2.9036128520965576, + "logits/rejected": -2.927844285964966, + "logps/chosen": -364.14666748046875, + "logps/rejected": -337.1743469238281, + "loss": 0.1198, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.1708009243011475, + "rewards/margins": 5.978755950927734, + "rewards/rejected": -4.807954788208008, + "step": 1400 + }, + { + "epoch": 1.45, + "eval_logits/chosen": -2.8625848293304443, + "eval_logits/rejected": -2.8527026176452637, + "eval_logps/chosen": -356.7679443359375, + "eval_logps/rejected": -308.636962890625, + "eval_loss": 0.5245142579078674, + "eval_rewards/accuracies": 0.7720000147819519, + "eval_rewards/chosen": -0.390207976102829, + "eval_rewards/margins": 2.0615620613098145, + "eval_rewards/rejected": -2.451770067214966, + "eval_runtime": 499.4895, + "eval_samples_per_second": 4.004, + "eval_steps_per_second": 0.501, + "step": 1400 + }, + { + "epoch": 1.46, + "learning_rate": 2.858783008036739e-07, + "logits/chosen": -2.8276944160461426, + "logits/rejected": -2.841846466064453, + "logps/chosen": -348.5047302246094, + "logps/rejected": -304.74224853515625, + "loss": 0.1106, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.3548219203948975, + "rewards/margins": 5.720047950744629, + "rewards/rejected": -4.365225791931152, + "step": 1410 + }, + { + "epoch": 1.47, + "learning_rate": 2.8396479142747797e-07, + "logits/chosen": -2.8640191555023193, + "logits/rejected": -2.8874547481536865, + "logps/chosen": -309.53729248046875, + "logps/rejected": -313.78900146484375, + "loss": 0.1083, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 1.2925539016723633, + "rewards/margins": 5.640458106994629, + "rewards/rejected": -4.347904682159424, + "step": 1420 + }, + { + "epoch": 1.48, + "learning_rate": 2.8205128205128203e-07, + "logits/chosen": -2.8046693801879883, + "logits/rejected": -2.796954393386841, + "logps/chosen": -321.08740234375, + "logps/rejected": -325.5114440917969, + "loss": 0.1134, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 1.1036927700042725, + "rewards/margins": 5.514297962188721, + "rewards/rejected": -4.410605430603027, + "step": 1430 + }, + { + "epoch": 1.49, + "learning_rate": 2.801377726750861e-07, + "logits/chosen": -2.8305039405822754, + "logits/rejected": -2.8174493312835693, + "logps/chosen": -316.93365478515625, + "logps/rejected": -325.77801513671875, + "loss": 0.1059, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.1122941970825195, + "rewards/margins": 5.836568832397461, + "rewards/rejected": -4.724274158477783, + "step": 1440 + }, + { + "epoch": 1.5, + "learning_rate": 2.7822426329889015e-07, + "logits/chosen": -2.8809874057769775, + "logits/rejected": -2.8768084049224854, + "logps/chosen": -316.82830810546875, + "logps/rejected": -330.55633544921875, + "loss": 0.1244, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.0005091428756714, + "rewards/margins": 5.6459479331970215, + "rewards/rejected": -4.645439147949219, + "step": 1450 + }, + { + "epoch": 1.51, + "learning_rate": 2.763107539226942e-07, + "logits/chosen": -2.9045822620391846, + "logits/rejected": -2.8883230686187744, + "logps/chosen": -330.44024658203125, + "logps/rejected": -350.8652648925781, + "loss": 0.109, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 1.5636141300201416, + "rewards/margins": 6.590689182281494, + "rewards/rejected": -5.027074813842773, + "step": 1460 + }, + { + "epoch": 1.52, + "learning_rate": 2.743972445464983e-07, + "logits/chosen": -2.889626979827881, + "logits/rejected": -2.865269422531128, + "logps/chosen": -308.53887939453125, + "logps/rejected": -303.1893005371094, + "loss": 0.103, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 1.100376844406128, + "rewards/margins": 5.918498992919922, + "rewards/rejected": -4.818121910095215, + "step": 1470 + }, + { + "epoch": 1.53, + "learning_rate": 2.7248373517030234e-07, + "logits/chosen": -2.8868496417999268, + "logits/rejected": -2.901533842086792, + "logps/chosen": -335.3681335449219, + "logps/rejected": -349.62115478515625, + "loss": 0.1122, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.8411431312561035, + "rewards/margins": 5.649033546447754, + "rewards/rejected": -4.807890892028809, + "step": 1480 + }, + { + "epoch": 1.54, + "learning_rate": 2.705702257941064e-07, + "logits/chosen": -2.8572373390197754, + "logits/rejected": -2.869990110397339, + "logps/chosen": -396.15887451171875, + "logps/rejected": -329.05584716796875, + "loss": 0.1019, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.6818903684616089, + "rewards/margins": 5.830107688903809, + "rewards/rejected": -5.148218154907227, + "step": 1490 + }, + { + "epoch": 1.55, + "learning_rate": 2.686567164179104e-07, + "logits/chosen": -2.8778789043426514, + "logits/rejected": -2.8655364513397217, + "logps/chosen": -313.30499267578125, + "logps/rejected": -300.9337463378906, + "loss": 0.1122, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.5041682720184326, + "rewards/margins": 5.160029411315918, + "rewards/rejected": -4.6558613777160645, + "step": 1500 + }, + { + "epoch": 1.55, + "eval_logits/chosen": -2.8394267559051514, + "eval_logits/rejected": -2.831911563873291, + "eval_logps/chosen": -359.64434814453125, + "eval_logps/rejected": -315.60784912109375, + "eval_loss": 0.552358865737915, + "eval_rewards/accuracies": 0.7860000133514404, + "eval_rewards/chosen": -0.6778488755226135, + "eval_rewards/margins": 2.471008539199829, + "eval_rewards/rejected": -3.1488571166992188, + "eval_runtime": 499.603, + "eval_samples_per_second": 4.003, + "eval_steps_per_second": 0.5, + "step": 1500 + }, + { + "epoch": 1.56, + "learning_rate": 2.6674320704171447e-07, + "logits/chosen": -2.8329129219055176, + "logits/rejected": -2.8036797046661377, + "logps/chosen": -313.36224365234375, + "logps/rejected": -286.71441650390625, + "loss": 0.0965, + "rewards/accuracies": 0.9437500238418579, + "rewards/chosen": 0.6077798008918762, + "rewards/margins": 5.281628608703613, + "rewards/rejected": -4.673849105834961, + "step": 1510 + }, + { + "epoch": 1.57, + "learning_rate": 2.6482969766551853e-07, + "logits/chosen": -2.8273487091064453, + "logits/rejected": -2.837484121322632, + "logps/chosen": -315.29852294921875, + "logps/rejected": -347.20343017578125, + "loss": 0.1045, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.0901168584823608, + "rewards/margins": 5.834365367889404, + "rewards/rejected": -4.744248390197754, + "step": 1520 + }, + { + "epoch": 1.58, + "learning_rate": 2.629161882893226e-07, + "logits/chosen": -2.8795628547668457, + "logits/rejected": -2.862488269805908, + "logps/chosen": -348.51812744140625, + "logps/rejected": -306.18621826171875, + "loss": 0.0929, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.8217048645019531, + "rewards/margins": 5.343213081359863, + "rewards/rejected": -4.52150821685791, + "step": 1530 + }, + { + "epoch": 1.59, + "learning_rate": 2.6100267891312666e-07, + "logits/chosen": -2.8448562622070312, + "logits/rejected": -2.8945586681365967, + "logps/chosen": -353.49822998046875, + "logps/rejected": -330.5164794921875, + "loss": 0.1189, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 1.4604480266571045, + "rewards/margins": 6.311491966247559, + "rewards/rejected": -4.851044178009033, + "step": 1540 + }, + { + "epoch": 1.6, + "learning_rate": 2.590891695369307e-07, + "logits/chosen": -2.8233683109283447, + "logits/rejected": -2.811624050140381, + "logps/chosen": -303.26861572265625, + "logps/rejected": -277.67401123046875, + "loss": 0.1288, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 1.3482983112335205, + "rewards/margins": 5.738016605377197, + "rewards/rejected": -4.389718055725098, + "step": 1550 + }, + { + "epoch": 1.61, + "learning_rate": 2.571756601607348e-07, + "logits/chosen": -2.8882668018341064, + "logits/rejected": -2.8739676475524902, + "logps/chosen": -373.87176513671875, + "logps/rejected": -320.65447998046875, + "loss": 0.1383, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 1.4747350215911865, + "rewards/margins": 6.119807243347168, + "rewards/rejected": -4.645071983337402, + "step": 1560 + }, + { + "epoch": 1.62, + "learning_rate": 2.5526215078453884e-07, + "logits/chosen": -2.880866527557373, + "logits/rejected": -2.874523878097534, + "logps/chosen": -335.191650390625, + "logps/rejected": -368.77197265625, + "loss": 0.0904, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.1102583408355713, + "rewards/margins": 6.102787017822266, + "rewards/rejected": -4.992527961730957, + "step": 1570 + }, + { + "epoch": 1.63, + "learning_rate": 2.533486414083429e-07, + "logits/chosen": -2.92406964302063, + "logits/rejected": -2.9168031215667725, + "logps/chosen": -344.280517578125, + "logps/rejected": -319.5175476074219, + "loss": 0.0894, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.228049874305725, + "rewards/margins": 5.966233253479004, + "rewards/rejected": -4.738183498382568, + "step": 1580 + }, + { + "epoch": 1.64, + "learning_rate": 2.5143513203214697e-07, + "logits/chosen": -2.846970796585083, + "logits/rejected": -2.824373245239258, + "logps/chosen": -339.29522705078125, + "logps/rejected": -325.4171142578125, + "loss": 0.0822, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.318579077720642, + "rewards/margins": 6.132169246673584, + "rewards/rejected": -4.813591003417969, + "step": 1590 + }, + { + "epoch": 1.65, + "learning_rate": 2.49521622655951e-07, + "logits/chosen": -2.776440143585205, + "logits/rejected": -2.780971050262451, + "logps/chosen": -346.0995178222656, + "logps/rejected": -348.487060546875, + "loss": 0.11, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.384293556213379, + "rewards/margins": 5.884185791015625, + "rewards/rejected": -4.499892234802246, + "step": 1600 + }, + { + "epoch": 1.65, + "eval_logits/chosen": -2.855889320373535, + "eval_logits/rejected": -2.846108913421631, + "eval_logps/chosen": -357.83306884765625, + "eval_logps/rejected": -311.6435241699219, + "eval_loss": 0.5355437397956848, + "eval_rewards/accuracies": 0.777999997138977, + "eval_rewards/chosen": -0.4967198967933655, + "eval_rewards/margins": 2.2557058334350586, + "eval_rewards/rejected": -2.7524256706237793, + "eval_runtime": 499.2286, + "eval_samples_per_second": 4.006, + "eval_steps_per_second": 0.501, + "step": 1600 + }, + { + "epoch": 1.66, + "learning_rate": 2.4760811327975504e-07, + "logits/chosen": -2.8780884742736816, + "logits/rejected": -2.892906665802002, + "logps/chosen": -377.72430419921875, + "logps/rejected": -332.82794189453125, + "loss": 0.1108, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": 1.1861491203308105, + "rewards/margins": 5.642457485198975, + "rewards/rejected": -4.456308364868164, + "step": 1610 + }, + { + "epoch": 1.67, + "learning_rate": 2.456946039035591e-07, + "logits/chosen": -2.8738551139831543, + "logits/rejected": -2.876325845718384, + "logps/chosen": -366.89691162109375, + "logps/rejected": -316.91021728515625, + "loss": 0.1101, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 1.4744036197662354, + "rewards/margins": 5.994829177856445, + "rewards/rejected": -4.520425319671631, + "step": 1620 + }, + { + "epoch": 1.68, + "learning_rate": 2.4378109452736316e-07, + "logits/chosen": -2.869784355163574, + "logits/rejected": -2.8577165603637695, + "logps/chosen": -334.3713073730469, + "logps/rejected": -318.83404541015625, + "loss": 0.1145, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.051092505455017, + "rewards/margins": 5.565567970275879, + "rewards/rejected": -4.5144758224487305, + "step": 1630 + }, + { + "epoch": 1.69, + "learning_rate": 2.418675851511672e-07, + "logits/chosen": -2.8268637657165527, + "logits/rejected": -2.8105154037475586, + "logps/chosen": -351.9185791015625, + "logps/rejected": -307.3794860839844, + "loss": 0.081, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.2795428037643433, + "rewards/margins": 5.681187629699707, + "rewards/rejected": -4.401645183563232, + "step": 1640 + }, + { + "epoch": 1.7, + "learning_rate": 2.399540757749713e-07, + "logits/chosen": -2.8763937950134277, + "logits/rejected": -2.880305767059326, + "logps/chosen": -334.76763916015625, + "logps/rejected": -356.3011169433594, + "loss": 0.1503, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.0690113306045532, + "rewards/margins": 5.896848678588867, + "rewards/rejected": -4.8278374671936035, + "step": 1650 + }, + { + "epoch": 1.71, + "learning_rate": 2.3804056639877535e-07, + "logits/chosen": -2.8334081172943115, + "logits/rejected": -2.8573122024536133, + "logps/chosen": -297.3508605957031, + "logps/rejected": -295.17132568359375, + "loss": 0.1173, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 1.0110653638839722, + "rewards/margins": 5.1915178298950195, + "rewards/rejected": -4.1804518699646, + "step": 1660 + }, + { + "epoch": 1.72, + "learning_rate": 2.361270570225794e-07, + "logits/chosen": -2.8628039360046387, + "logits/rejected": -2.847696542739868, + "logps/chosen": -349.5933837890625, + "logps/rejected": -297.1512451171875, + "loss": 0.0924, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 1.53627610206604, + "rewards/margins": 5.680062770843506, + "rewards/rejected": -4.143786430358887, + "step": 1670 + }, + { + "epoch": 1.74, + "learning_rate": 2.3421354764638345e-07, + "logits/chosen": -2.8902525901794434, + "logits/rejected": -2.8761532306671143, + "logps/chosen": -328.5924377441406, + "logps/rejected": -352.31756591796875, + "loss": 0.0937, + "rewards/accuracies": 0.96875, + "rewards/chosen": 1.1631652116775513, + "rewards/margins": 5.828993320465088, + "rewards/rejected": -4.665827751159668, + "step": 1680 + }, + { + "epoch": 1.75, + "learning_rate": 2.323000382701875e-07, + "logits/chosen": -2.8299720287323, + "logits/rejected": -2.8690247535705566, + "logps/chosen": -341.2825622558594, + "logps/rejected": -324.95574951171875, + "loss": 0.1036, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 1.3289188146591187, + "rewards/margins": 5.7310357093811035, + "rewards/rejected": -4.402116775512695, + "step": 1690 + }, + { + "epoch": 1.76, + "learning_rate": 2.3038652889399157e-07, + "logits/chosen": -2.8278417587280273, + "logits/rejected": -2.8181910514831543, + "logps/chosen": -305.45843505859375, + "logps/rejected": -333.2713928222656, + "loss": 0.1092, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.8609102368354797, + "rewards/margins": 5.566982269287109, + "rewards/rejected": -4.7060723304748535, + "step": 1700 + }, + { + "epoch": 1.76, + "eval_logits/chosen": -2.8601489067077637, + "eval_logits/rejected": -2.8501791954040527, + "eval_logps/chosen": -358.9333190917969, + "eval_logps/rejected": -315.2887878417969, + "eval_loss": 0.5580697059631348, + "eval_rewards/accuracies": 0.7799999713897705, + "eval_rewards/chosen": -0.606745719909668, + "eval_rewards/margins": 2.510206699371338, + "eval_rewards/rejected": -3.1169521808624268, + "eval_runtime": 499.7093, + "eval_samples_per_second": 4.002, + "eval_steps_per_second": 0.5, + "step": 1700 + }, + { + "epoch": 1.77, + "learning_rate": 2.2847301951779563e-07, + "logits/chosen": -2.837820529937744, + "logits/rejected": -2.831998825073242, + "logps/chosen": -320.96673583984375, + "logps/rejected": -343.4007568359375, + "loss": 0.094, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.5116377472877502, + "rewards/margins": 5.757063388824463, + "rewards/rejected": -5.245425224304199, + "step": 1710 + }, + { + "epoch": 1.78, + "learning_rate": 2.265595101415997e-07, + "logits/chosen": -2.756493091583252, + "logits/rejected": -2.763211488723755, + "logps/chosen": -317.48724365234375, + "logps/rejected": -356.63848876953125, + "loss": 0.0945, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.9465705156326294, + "rewards/margins": 6.130054473876953, + "rewards/rejected": -5.1834845542907715, + "step": 1720 + }, + { + "epoch": 1.79, + "learning_rate": 2.2464600076540373e-07, + "logits/chosen": -2.8534083366394043, + "logits/rejected": -2.858447551727295, + "logps/chosen": -358.9150085449219, + "logps/rejected": -313.8974914550781, + "loss": 0.1016, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 1.2323501110076904, + "rewards/margins": 5.758856773376465, + "rewards/rejected": -4.526506423950195, + "step": 1730 + }, + { + "epoch": 1.8, + "learning_rate": 2.227324913892078e-07, + "logits/chosen": -2.912522077560425, + "logits/rejected": -2.897264003753662, + "logps/chosen": -322.6852111816406, + "logps/rejected": -323.4605407714844, + "loss": 0.1225, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.682808518409729, + "rewards/margins": 5.444243907928467, + "rewards/rejected": -4.761435508728027, + "step": 1740 + }, + { + "epoch": 1.81, + "learning_rate": 2.2081898201301186e-07, + "logits/chosen": -2.934340715408325, + "logits/rejected": -2.9151082038879395, + "logps/chosen": -325.00506591796875, + "logps/rejected": -297.88970947265625, + "loss": 0.1372, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 0.8912375569343567, + "rewards/margins": 5.291516304016113, + "rewards/rejected": -4.400278568267822, + "step": 1750 + }, + { + "epoch": 1.82, + "learning_rate": 2.1890547263681592e-07, + "logits/chosen": -2.913761615753174, + "logits/rejected": -2.885958671569824, + "logps/chosen": -306.1551818847656, + "logps/rejected": -300.98052978515625, + "loss": 0.0873, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.9583765268325806, + "rewards/margins": 5.630650520324707, + "rewards/rejected": -4.672274589538574, + "step": 1760 + }, + { + "epoch": 1.83, + "learning_rate": 2.1699196326061998e-07, + "logits/chosen": -2.8752079010009766, + "logits/rejected": -2.8725571632385254, + "logps/chosen": -303.1614685058594, + "logps/rejected": -306.0841369628906, + "loss": 0.2217, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 1.1173279285430908, + "rewards/margins": 5.881751537322998, + "rewards/rejected": -4.76442289352417, + "step": 1770 + }, + { + "epoch": 1.84, + "learning_rate": 2.1507845388442402e-07, + "logits/chosen": -2.869429588317871, + "logits/rejected": -2.8998007774353027, + "logps/chosen": -333.4254455566406, + "logps/rejected": -317.5960998535156, + "loss": 0.0849, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.9832474589347839, + "rewards/margins": 5.767635822296143, + "rewards/rejected": -4.784388065338135, + "step": 1780 + }, + { + "epoch": 1.85, + "learning_rate": 2.1316494450822808e-07, + "logits/chosen": -2.8548214435577393, + "logits/rejected": -2.875349521636963, + "logps/chosen": -364.0671691894531, + "logps/rejected": -348.7872009277344, + "loss": 0.0693, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.9190314412117004, + "rewards/margins": 6.185535430908203, + "rewards/rejected": -5.266503810882568, + "step": 1790 + }, + { + "epoch": 1.86, + "learning_rate": 2.1125143513203214e-07, + "logits/chosen": -2.9028079509735107, + "logits/rejected": -2.8680977821350098, + "logps/chosen": -333.8460388183594, + "logps/rejected": -318.41485595703125, + "loss": 0.0958, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.7885669469833374, + "rewards/margins": 5.999600410461426, + "rewards/rejected": -5.211032867431641, + "step": 1800 + }, + { + "epoch": 1.86, + "eval_logits/chosen": -2.8540711402893066, + "eval_logits/rejected": -2.84735107421875, + "eval_logps/chosen": -360.1445617675781, + "eval_logps/rejected": -316.60345458984375, + "eval_loss": 0.5646550059318542, + "eval_rewards/accuracies": 0.7760000228881836, + "eval_rewards/chosen": -0.727875292301178, + "eval_rewards/margins": 2.5205445289611816, + "eval_rewards/rejected": -3.248420000076294, + "eval_runtime": 499.6295, + "eval_samples_per_second": 4.003, + "eval_steps_per_second": 0.5, + "step": 1800 + }, + { + "epoch": 1.87, + "learning_rate": 2.093379257558362e-07, + "logits/chosen": -2.8835785388946533, + "logits/rejected": -2.8600425720214844, + "logps/chosen": -308.075439453125, + "logps/rejected": -318.9173278808594, + "loss": 0.1363, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.6390537023544312, + "rewards/margins": 5.642648220062256, + "rewards/rejected": -5.003594398498535, + "step": 1810 + }, + { + "epoch": 1.88, + "learning_rate": 2.0742441637964026e-07, + "logits/chosen": -2.807243824005127, + "logits/rejected": -2.837427854537964, + "logps/chosen": -361.7774353027344, + "logps/rejected": -323.4786071777344, + "loss": 0.1082, + "rewards/accuracies": 0.9312499761581421, + "rewards/chosen": 1.1513266563415527, + "rewards/margins": 6.465740203857422, + "rewards/rejected": -5.314412593841553, + "step": 1820 + }, + { + "epoch": 1.89, + "learning_rate": 2.055109070034443e-07, + "logits/chosen": -2.865370035171509, + "logits/rejected": -2.8720335960388184, + "logps/chosen": -337.0066223144531, + "logps/rejected": -313.6158752441406, + "loss": 0.1168, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.4502478539943695, + "rewards/margins": 5.214186668395996, + "rewards/rejected": -4.763939380645752, + "step": 1830 + }, + { + "epoch": 1.9, + "learning_rate": 2.0359739762724836e-07, + "logits/chosen": -2.8631224632263184, + "logits/rejected": -2.8733620643615723, + "logps/chosen": -346.0338134765625, + "logps/rejected": -347.346435546875, + "loss": 0.0798, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.661190927028656, + "rewards/margins": 6.034590721130371, + "rewards/rejected": -5.373399257659912, + "step": 1840 + }, + { + "epoch": 1.91, + "learning_rate": 2.0168388825105242e-07, + "logits/chosen": -2.901468276977539, + "logits/rejected": -2.9260478019714355, + "logps/chosen": -327.5912780761719, + "logps/rejected": -345.6995544433594, + "loss": 0.0813, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.8747514486312866, + "rewards/margins": 5.987407684326172, + "rewards/rejected": -5.112656593322754, + "step": 1850 + }, + { + "epoch": 1.92, + "learning_rate": 1.997703788748565e-07, + "logits/chosen": -2.87198805809021, + "logits/rejected": -2.875633955001831, + "logps/chosen": -346.01959228515625, + "logps/rejected": -317.34527587890625, + "loss": 0.0973, + "rewards/accuracies": 0.956250011920929, + "rewards/chosen": 0.6318389177322388, + "rewards/margins": 5.922452449798584, + "rewards/rejected": -5.290614128112793, + "step": 1860 + }, + { + "epoch": 1.93, + "learning_rate": 1.9785686949866055e-07, + "logits/chosen": -2.863274335861206, + "logits/rejected": -2.876680612564087, + "logps/chosen": -330.6842346191406, + "logps/rejected": -353.137939453125, + "loss": 0.1081, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 1.135359764099121, + "rewards/margins": 6.191943168640137, + "rewards/rejected": -5.056582927703857, + "step": 1870 + }, + { + "epoch": 1.94, + "learning_rate": 1.9594336012246458e-07, + "logits/chosen": -2.888240337371826, + "logits/rejected": -2.903027296066284, + "logps/chosen": -323.24761962890625, + "logps/rejected": -327.24169921875, + "loss": 0.1001, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 0.8668416738510132, + "rewards/margins": 6.129169464111328, + "rewards/rejected": -5.262328147888184, + "step": 1880 + }, + { + "epoch": 1.95, + "learning_rate": 1.9402985074626865e-07, + "logits/chosen": -2.91502046585083, + "logits/rejected": -2.9133684635162354, + "logps/chosen": -339.2613830566406, + "logps/rejected": -322.16473388671875, + "loss": 0.0919, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.4730144441127777, + "rewards/margins": 5.7007269859313965, + "rewards/rejected": -5.227712154388428, + "step": 1890 + }, + { + "epoch": 1.96, + "learning_rate": 1.921163413700727e-07, + "logits/chosen": -2.9004921913146973, + "logits/rejected": -2.904614210128784, + "logps/chosen": -315.8312072753906, + "logps/rejected": -312.64373779296875, + "loss": 0.122, + "rewards/accuracies": 0.96875, + "rewards/chosen": 0.6923476457595825, + "rewards/margins": 6.188453197479248, + "rewards/rejected": -5.4961066246032715, + "step": 1900 + }, + { + "epoch": 1.96, + "eval_logits/chosen": -2.874333381652832, + "eval_logits/rejected": -2.8706977367401123, + "eval_logps/chosen": -360.60406494140625, + "eval_logps/rejected": -314.73699951171875, + "eval_loss": 0.5520058870315552, + "eval_rewards/accuracies": 0.777999997138977, + "eval_rewards/chosen": -0.773823082447052, + "eval_rewards/margins": 2.2879505157470703, + "eval_rewards/rejected": -3.0617735385894775, + "eval_runtime": 499.5877, + "eval_samples_per_second": 4.003, + "eval_steps_per_second": 0.5, + "step": 1900 + }, + { + "epoch": 1.97, + "learning_rate": 1.9020283199387677e-07, + "logits/chosen": -2.928802251815796, + "logits/rejected": -2.9289422035217285, + "logps/chosen": -347.2261047363281, + "logps/rejected": -324.1834411621094, + "loss": 0.1138, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.5844297409057617, + "rewards/margins": 5.202861785888672, + "rewards/rejected": -4.618431568145752, + "step": 1910 + }, + { + "epoch": 1.98, + "learning_rate": 1.8828932261768083e-07, + "logits/chosen": -2.8577020168304443, + "logits/rejected": -2.8607382774353027, + "logps/chosen": -292.01336669921875, + "logps/rejected": -294.7176208496094, + "loss": 0.1066, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.4617652893066406, + "rewards/margins": 5.312861919403076, + "rewards/rejected": -4.8510966300964355, + "step": 1920 + }, + { + "epoch": 1.99, + "learning_rate": 1.8637581324148487e-07, + "logits/chosen": -2.9372596740722656, + "logits/rejected": -2.918520450592041, + "logps/chosen": -331.2519226074219, + "logps/rejected": -349.9786071777344, + "loss": 0.105, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.7993395924568176, + "rewards/margins": 5.975264549255371, + "rewards/rejected": -5.175924777984619, + "step": 1930 + }, + { + "epoch": 2.0, + "learning_rate": 1.8446230386528893e-07, + "logits/chosen": -2.8874263763427734, + "logits/rejected": -2.875577926635742, + "logps/chosen": -295.83270263671875, + "logps/rejected": -297.573486328125, + "loss": 0.0743, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.7125546932220459, + "rewards/margins": 5.492268085479736, + "rewards/rejected": -4.779712677001953, + "step": 1940 + }, + { + "epoch": 2.01, + "learning_rate": 1.82548794489093e-07, + "logits/chosen": -2.850792646408081, + "logits/rejected": -2.8736298084259033, + "logps/chosen": -342.15423583984375, + "logps/rejected": -383.8853759765625, + "loss": 0.023, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 1.615793228149414, + "rewards/margins": 7.413217067718506, + "rewards/rejected": -5.797423362731934, + "step": 1950 + }, + { + "epoch": 2.02, + "learning_rate": 1.8063528511289706e-07, + "logits/chosen": -2.883690357208252, + "logits/rejected": -2.902615785598755, + "logps/chosen": -322.6307067871094, + "logps/rejected": -383.23187255859375, + "loss": 0.0214, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5958327054977417, + "rewards/margins": 6.831275939941406, + "rewards/rejected": -6.235442161560059, + "step": 1960 + }, + { + "epoch": 2.03, + "learning_rate": 1.7872177573670112e-07, + "logits/chosen": -2.8846828937530518, + "logits/rejected": -2.893906831741333, + "logps/chosen": -289.3814392089844, + "logps/rejected": -302.682373046875, + "loss": 0.0231, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6643503904342651, + "rewards/margins": 6.625860691070557, + "rewards/rejected": -5.961510181427002, + "step": 1970 + }, + { + "epoch": 2.04, + "learning_rate": 1.7680826636050515e-07, + "logits/chosen": -2.87156343460083, + "logits/rejected": -2.8620166778564453, + "logps/chosen": -311.15234375, + "logps/rejected": -340.3420104980469, + "loss": 0.0197, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1246318817138672, + "rewards/margins": 7.64337158203125, + "rewards/rejected": -6.518739223480225, + "step": 1980 + }, + { + "epoch": 2.06, + "learning_rate": 1.7489475698430921e-07, + "logits/chosen": -2.8390986919403076, + "logits/rejected": -2.8416831493377686, + "logps/chosen": -321.42474365234375, + "logps/rejected": -331.75921630859375, + "loss": 0.0891, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8244115114212036, + "rewards/margins": 7.307824611663818, + "rewards/rejected": -6.4834136962890625, + "step": 1990 + }, + { + "epoch": 2.07, + "learning_rate": 1.7298124760811328e-07, + "logits/chosen": -2.8488190174102783, + "logits/rejected": -2.827085256576538, + "logps/chosen": -349.63287353515625, + "logps/rejected": -342.38812255859375, + "loss": 0.0242, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 0.6490751504898071, + "rewards/margins": 7.272631645202637, + "rewards/rejected": -6.623556613922119, + "step": 2000 + }, + { + "epoch": 2.07, + "eval_logits/chosen": -2.860494375228882, + "eval_logits/rejected": -2.8475303649902344, + "eval_logps/chosen": -365.9564514160156, + "eval_logps/rejected": -326.2118835449219, + "eval_loss": 0.6110661029815674, + "eval_rewards/accuracies": 0.7639999985694885, + "eval_rewards/chosen": -1.3090580701828003, + "eval_rewards/margins": 2.900202989578247, + "eval_rewards/rejected": -4.209260940551758, + "eval_runtime": 499.6212, + "eval_samples_per_second": 4.003, + "eval_steps_per_second": 0.5, + "step": 2000 + }, + { + "epoch": 2.08, + "learning_rate": 1.7106773823191734e-07, + "logits/chosen": -2.896272659301758, + "logits/rejected": -2.8855504989624023, + "logps/chosen": -329.5340270996094, + "logps/rejected": -330.1529846191406, + "loss": 0.0161, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": 1.1853575706481934, + "rewards/margins": 7.9635748863220215, + "rewards/rejected": -6.778217315673828, + "step": 2010 + }, + { + "epoch": 2.09, + "learning_rate": 1.691542288557214e-07, + "logits/chosen": -2.8596339225769043, + "logits/rejected": -2.8543241024017334, + "logps/chosen": -334.30126953125, + "logps/rejected": -366.9293518066406, + "loss": 0.013, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7636312246322632, + "rewards/margins": 8.40580940246582, + "rewards/rejected": -7.642178535461426, + "step": 2020 + }, + { + "epoch": 2.1, + "learning_rate": 1.6724071947952544e-07, + "logits/chosen": -2.858603000640869, + "logits/rejected": -2.8608195781707764, + "logps/chosen": -308.79205322265625, + "logps/rejected": -325.49432373046875, + "loss": 0.0295, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": 0.8938325047492981, + "rewards/margins": 7.766848087310791, + "rewards/rejected": -6.873016357421875, + "step": 2030 + }, + { + "epoch": 2.11, + "learning_rate": 1.653272101033295e-07, + "logits/chosen": -2.877743721008301, + "logits/rejected": -2.8870015144348145, + "logps/chosen": -360.132080078125, + "logps/rejected": -332.0694580078125, + "loss": 0.0139, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": 0.8420451283454895, + "rewards/margins": 7.829560279846191, + "rewards/rejected": -6.987515449523926, + "step": 2040 + }, + { + "epoch": 2.12, + "learning_rate": 1.6341370072713356e-07, + "logits/chosen": -2.908205509185791, + "logits/rejected": -2.8962578773498535, + "logps/chosen": -363.4471130371094, + "logps/rejected": -384.8677978515625, + "loss": 0.0182, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 0.9818631410598755, + "rewards/margins": 7.976204872131348, + "rewards/rejected": -6.9943413734436035, + "step": 2050 + }, + { + "epoch": 2.13, + "learning_rate": 1.6150019135093762e-07, + "logits/chosen": -2.9130074977874756, + "logits/rejected": -2.867249011993408, + "logps/chosen": -394.2611999511719, + "logps/rejected": -368.763427734375, + "loss": 0.0138, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1961416006088257, + "rewards/margins": 8.216410636901855, + "rewards/rejected": -7.020269870758057, + "step": 2060 + }, + { + "epoch": 2.14, + "learning_rate": 1.5958668197474169e-07, + "logits/chosen": -2.9357872009277344, + "logits/rejected": -2.9347925186157227, + "logps/chosen": -328.8543701171875, + "logps/rejected": -369.68292236328125, + "loss": 0.0166, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9537331461906433, + "rewards/margins": 8.309990882873535, + "rewards/rejected": -7.356257438659668, + "step": 2070 + }, + { + "epoch": 2.15, + "learning_rate": 1.5767317259854572e-07, + "logits/chosen": -2.928189754486084, + "logits/rejected": -2.8961434364318848, + "logps/chosen": -307.6603088378906, + "logps/rejected": -294.9259338378906, + "loss": 0.0136, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": 0.532218873500824, + "rewards/margins": 7.480643272399902, + "rewards/rejected": -6.948424339294434, + "step": 2080 + }, + { + "epoch": 2.16, + "learning_rate": 1.5575966322234978e-07, + "logits/chosen": -2.8677449226379395, + "logits/rejected": -2.8330647945404053, + "logps/chosen": -331.5555725097656, + "logps/rejected": -366.13372802734375, + "loss": 0.0149, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": 0.19210126996040344, + "rewards/margins": 7.909059047698975, + "rewards/rejected": -7.716958522796631, + "step": 2090 + }, + { + "epoch": 2.17, + "learning_rate": 1.5384615384615385e-07, + "logits/chosen": -2.8884618282318115, + "logits/rejected": -2.8951830863952637, + "logps/chosen": -340.19757080078125, + "logps/rejected": -328.8606872558594, + "loss": 0.017, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4257478713989258, + "rewards/margins": 7.282183647155762, + "rewards/rejected": -6.856435298919678, + "step": 2100 + }, + { + "epoch": 2.17, + "eval_logits/chosen": -2.830853223800659, + "eval_logits/rejected": -2.8138229846954346, + "eval_logps/chosen": -369.8929748535156, + "eval_logps/rejected": -333.4358215332031, + "eval_loss": 0.6473292112350464, + "eval_rewards/accuracies": 0.7620000243186951, + "eval_rewards/chosen": -1.7027121782302856, + "eval_rewards/margins": 3.2289414405822754, + "eval_rewards/rejected": -4.93165397644043, + "eval_runtime": 499.8882, + "eval_samples_per_second": 4.001, + "eval_steps_per_second": 0.5, + "step": 2100 + }, + { + "epoch": 2.18, + "learning_rate": 1.519326444699579e-07, + "logits/chosen": -2.8692545890808105, + "logits/rejected": -2.8672494888305664, + "logps/chosen": -314.2371520996094, + "logps/rejected": -342.84564208984375, + "loss": 0.0121, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.4992009103298187, + "rewards/margins": 8.060718536376953, + "rewards/rejected": -7.56151819229126, + "step": 2110 + }, + { + "epoch": 2.19, + "learning_rate": 1.5001913509376197e-07, + "logits/chosen": -2.8434972763061523, + "logits/rejected": -2.8411061763763428, + "logps/chosen": -338.0544738769531, + "logps/rejected": -347.9879455566406, + "loss": 0.018, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": 0.41134971380233765, + "rewards/margins": 7.6756792068481445, + "rewards/rejected": -7.264329433441162, + "step": 2120 + }, + { + "epoch": 2.2, + "learning_rate": 1.4810562571756603e-07, + "logits/chosen": -2.9009549617767334, + "logits/rejected": -2.9190447330474854, + "logps/chosen": -303.60821533203125, + "logps/rejected": -355.43682861328125, + "loss": 0.0144, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2904825210571289, + "rewards/margins": 8.14781379699707, + "rewards/rejected": -7.857331275939941, + "step": 2130 + }, + { + "epoch": 2.21, + "learning_rate": 1.4619211634137007e-07, + "logits/chosen": -2.875998020172119, + "logits/rejected": -2.8702502250671387, + "logps/chosen": -329.28955078125, + "logps/rejected": -334.1612548828125, + "loss": 0.0208, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.40643930435180664, + "rewards/margins": 7.531008243560791, + "rewards/rejected": -7.124569892883301, + "step": 2140 + }, + { + "epoch": 2.22, + "learning_rate": 1.4427860696517413e-07, + "logits/chosen": -2.814622402191162, + "logits/rejected": -2.8140344619750977, + "logps/chosen": -369.78466796875, + "logps/rejected": -388.4234313964844, + "loss": 0.0156, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": 0.7858397960662842, + "rewards/margins": 9.021352767944336, + "rewards/rejected": -8.235512733459473, + "step": 2150 + }, + { + "epoch": 2.23, + "learning_rate": 1.423650975889782e-07, + "logits/chosen": -2.8845508098602295, + "logits/rejected": -2.862572431564331, + "logps/chosen": -364.07391357421875, + "logps/rejected": -366.08074951171875, + "loss": 0.0181, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": 0.4860958456993103, + "rewards/margins": 8.600242614746094, + "rewards/rejected": -8.11414623260498, + "step": 2160 + }, + { + "epoch": 2.24, + "learning_rate": 1.4045158821278225e-07, + "logits/chosen": -2.8357956409454346, + "logits/rejected": -2.8576788902282715, + "logps/chosen": -309.79833984375, + "logps/rejected": -352.6476135253906, + "loss": 0.0191, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.2403034269809723, + "rewards/margins": 7.672799110412598, + "rewards/rejected": -7.432496070861816, + "step": 2170 + }, + { + "epoch": 2.25, + "learning_rate": 1.3853807883658632e-07, + "logits/chosen": -2.843113660812378, + "logits/rejected": -2.8533434867858887, + "logps/chosen": -378.12005615234375, + "logps/rejected": -378.74713134765625, + "loss": 0.0168, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": 0.9185358881950378, + "rewards/margins": 8.218305587768555, + "rewards/rejected": -7.299769401550293, + "step": 2180 + }, + { + "epoch": 2.26, + "learning_rate": 1.3662456946039035e-07, + "logits/chosen": -2.8196911811828613, + "logits/rejected": -2.8274574279785156, + "logps/chosen": -305.7922668457031, + "logps/rejected": -350.10772705078125, + "loss": 0.0117, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.43905988335609436, + "rewards/margins": 8.206399917602539, + "rewards/rejected": -7.76733922958374, + "step": 2190 + }, + { + "epoch": 2.27, + "learning_rate": 1.3471106008419441e-07, + "logits/chosen": -2.8260867595672607, + "logits/rejected": -2.8560781478881836, + "logps/chosen": -319.553955078125, + "logps/rejected": -375.03985595703125, + "loss": 0.0153, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.368704617023468, + "rewards/margins": 8.854362487792969, + "rewards/rejected": -8.485657691955566, + "step": 2200 + }, + { + "epoch": 2.27, + "eval_logits/chosen": -2.80574369430542, + "eval_logits/rejected": -2.7871527671813965, + "eval_logps/chosen": -372.91436767578125, + "eval_logps/rejected": -338.00445556640625, + "eval_loss": 0.6657643914222717, + "eval_rewards/accuracies": 0.7699999809265137, + "eval_rewards/chosen": -2.004854440689087, + "eval_rewards/margins": 3.383664846420288, + "eval_rewards/rejected": -5.388518810272217, + "eval_runtime": 499.0606, + "eval_samples_per_second": 4.008, + "eval_steps_per_second": 0.501, + "step": 2200 + }, + { + "epoch": 2.28, + "learning_rate": 1.3279755070799848e-07, + "logits/chosen": -2.8646817207336426, + "logits/rejected": -2.8080735206604004, + "logps/chosen": -317.29412841796875, + "logps/rejected": -357.2666931152344, + "loss": 0.0234, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.43733105063438416, + "rewards/margins": 8.163047790527344, + "rewards/rejected": -7.725717067718506, + "step": 2210 + }, + { + "epoch": 2.29, + "learning_rate": 1.3088404133180254e-07, + "logits/chosen": -2.8100650310516357, + "logits/rejected": -2.7986183166503906, + "logps/chosen": -377.2625427246094, + "logps/rejected": -359.5750427246094, + "loss": 0.0141, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7715356349945068, + "rewards/margins": 8.111381530761719, + "rewards/rejected": -7.339845180511475, + "step": 2220 + }, + { + "epoch": 2.3, + "learning_rate": 1.289705319556066e-07, + "logits/chosen": -2.802110195159912, + "logits/rejected": -2.7565503120422363, + "logps/chosen": -313.4336853027344, + "logps/rejected": -351.89703369140625, + "loss": 0.0147, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36583349108695984, + "rewards/margins": 8.023749351501465, + "rewards/rejected": -8.389582633972168, + "step": 2230 + }, + { + "epoch": 2.31, + "learning_rate": 1.2705702257941064e-07, + "logits/chosen": -2.841660499572754, + "logits/rejected": -2.8044819831848145, + "logps/chosen": -352.5540771484375, + "logps/rejected": -346.982666015625, + "loss": 0.011, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3794718086719513, + "rewards/margins": 8.321603775024414, + "rewards/rejected": -7.942131996154785, + "step": 2240 + }, + { + "epoch": 2.32, + "learning_rate": 1.251435132032147e-07, + "logits/chosen": -2.850743532180786, + "logits/rejected": -2.8343441486358643, + "logps/chosen": -339.0596008300781, + "logps/rejected": -389.5365905761719, + "loss": 0.0971, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": 0.7595340013504028, + "rewards/margins": 8.639189720153809, + "rewards/rejected": -7.8796563148498535, + "step": 2250 + }, + { + "epoch": 2.33, + "learning_rate": 1.2323000382701873e-07, + "logits/chosen": -2.8145759105682373, + "logits/rejected": -2.8445546627044678, + "logps/chosen": -345.4944152832031, + "logps/rejected": -367.5064392089844, + "loss": 0.0459, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": 0.18377834558486938, + "rewards/margins": 8.928794860839844, + "rewards/rejected": -8.745016098022461, + "step": 2260 + }, + { + "epoch": 2.34, + "learning_rate": 1.213164944508228e-07, + "logits/chosen": -2.867486000061035, + "logits/rejected": -2.849432945251465, + "logps/chosen": -380.73876953125, + "logps/rejected": -375.8220520019531, + "loss": 0.0109, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": 0.4815305173397064, + "rewards/margins": 8.172357559204102, + "rewards/rejected": -7.690826416015625, + "step": 2270 + }, + { + "epoch": 2.35, + "learning_rate": 1.1940298507462686e-07, + "logits/chosen": -2.882193088531494, + "logits/rejected": -2.8630523681640625, + "logps/chosen": -309.0014953613281, + "logps/rejected": -321.5120849609375, + "loss": 0.0166, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -0.2122192680835724, + "rewards/margins": 7.754061698913574, + "rewards/rejected": -7.966280937194824, + "step": 2280 + }, + { + "epoch": 2.37, + "learning_rate": 1.1748947569843092e-07, + "logits/chosen": -2.8814282417297363, + "logits/rejected": -2.8726353645324707, + "logps/chosen": -348.33294677734375, + "logps/rejected": -351.8730163574219, + "loss": 0.0209, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": 0.46276918053627014, + "rewards/margins": 8.149224281311035, + "rewards/rejected": -7.686454772949219, + "step": 2290 + }, + { + "epoch": 2.38, + "learning_rate": 1.1557596632223497e-07, + "logits/chosen": -2.839801073074341, + "logits/rejected": -2.853092670440674, + "logps/chosen": -348.4852600097656, + "logps/rejected": -325.7494201660156, + "loss": 0.0215, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.5979126691818237, + "rewards/margins": 7.9850335121154785, + "rewards/rejected": -7.387121677398682, + "step": 2300 + }, + { + "epoch": 2.38, + "eval_logits/chosen": -2.840635299682617, + "eval_logits/rejected": -2.82096266746521, + "eval_logps/chosen": -370.4217529296875, + "eval_logps/rejected": -336.2837219238281, + "eval_loss": 0.6721770167350769, + "eval_rewards/accuracies": 0.777999997138977, + "eval_rewards/chosen": -1.7555896043777466, + "eval_rewards/margins": 3.46085524559021, + "eval_rewards/rejected": -5.216445446014404, + "eval_runtime": 499.1898, + "eval_samples_per_second": 4.006, + "eval_steps_per_second": 0.501, + "step": 2300 + }, + { + "epoch": 2.39, + "learning_rate": 1.1366245694603903e-07, + "logits/chosen": -2.863748550415039, + "logits/rejected": -2.858935594558716, + "logps/chosen": -323.34002685546875, + "logps/rejected": -339.47552490234375, + "loss": 0.0189, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": -0.011857276782393456, + "rewards/margins": 7.8068037033081055, + "rewards/rejected": -7.818660736083984, + "step": 2310 + }, + { + "epoch": 2.4, + "learning_rate": 1.1174894756984308e-07, + "logits/chosen": -2.8732120990753174, + "logits/rejected": -2.865734577178955, + "logps/chosen": -312.9558410644531, + "logps/rejected": -335.70782470703125, + "loss": 0.0157, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": 0.2408231794834137, + "rewards/margins": 8.007491111755371, + "rewards/rejected": -7.76666784286499, + "step": 2320 + }, + { + "epoch": 2.41, + "learning_rate": 1.0983543819364714e-07, + "logits/chosen": -2.8552298545837402, + "logits/rejected": -2.8243603706359863, + "logps/chosen": -353.9219665527344, + "logps/rejected": -354.1855163574219, + "loss": 0.0154, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.15999922156333923, + "rewards/margins": 8.37447738647461, + "rewards/rejected": -8.2144775390625, + "step": 2330 + }, + { + "epoch": 2.42, + "learning_rate": 1.079219288174512e-07, + "logits/chosen": -2.831439971923828, + "logits/rejected": -2.8317112922668457, + "logps/chosen": -287.0343017578125, + "logps/rejected": -336.79254150390625, + "loss": 0.0173, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.47535672783851624, + "rewards/margins": 8.473257064819336, + "rewards/rejected": -7.997899055480957, + "step": 2340 + }, + { + "epoch": 2.43, + "learning_rate": 1.0600841944125525e-07, + "logits/chosen": -2.8530073165893555, + "logits/rejected": -2.8641648292541504, + "logps/chosen": -333.83526611328125, + "logps/rejected": -347.7726135253906, + "loss": 0.0181, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.020889759063720703, + "rewards/margins": 8.293529510498047, + "rewards/rejected": -8.31441879272461, + "step": 2350 + }, + { + "epoch": 2.44, + "learning_rate": 1.0409491006505931e-07, + "logits/chosen": -2.828545331954956, + "logits/rejected": -2.837430238723755, + "logps/chosen": -356.1022033691406, + "logps/rejected": -366.8138732910156, + "loss": 0.0431, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.3571850657463074, + "rewards/margins": 8.733023643493652, + "rewards/rejected": -8.375839233398438, + "step": 2360 + }, + { + "epoch": 2.45, + "learning_rate": 1.0218140068886336e-07, + "logits/chosen": -2.76576566696167, + "logits/rejected": -2.765660285949707, + "logps/chosen": -331.6526794433594, + "logps/rejected": -355.6332092285156, + "loss": 0.0163, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.01460124272853136, + "rewards/margins": 7.919270992279053, + "rewards/rejected": -7.933871269226074, + "step": 2370 + }, + { + "epoch": 2.46, + "learning_rate": 1.0026789131266743e-07, + "logits/chosen": -2.7807886600494385, + "logits/rejected": -2.781130075454712, + "logps/chosen": -338.2047424316406, + "logps/rejected": -384.52490234375, + "loss": 0.0147, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": 0.1773863583803177, + "rewards/margins": 9.542524337768555, + "rewards/rejected": -9.365138053894043, + "step": 2380 + }, + { + "epoch": 2.47, + "learning_rate": 9.835438193647149e-08, + "logits/chosen": -2.7662460803985596, + "logits/rejected": -2.75819730758667, + "logps/chosen": -353.21697998046875, + "logps/rejected": -345.8397216796875, + "loss": 0.0181, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.11677594482898712, + "rewards/margins": 8.12705135345459, + "rewards/rejected": -8.243826866149902, + "step": 2390 + }, + { + "epoch": 2.48, + "learning_rate": 9.644087256027554e-08, + "logits/chosen": -2.786858320236206, + "logits/rejected": -2.7591071128845215, + "logps/chosen": -313.44757080078125, + "logps/rejected": -352.6889953613281, + "loss": 0.0128, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04645581170916557, + "rewards/margins": 9.099355697631836, + "rewards/rejected": -9.14581298828125, + "step": 2400 + }, + { + "epoch": 2.48, + "eval_logits/chosen": -2.8010993003845215, + "eval_logits/rejected": -2.77828049659729, + "eval_logps/chosen": -373.6317443847656, + "eval_logps/rejected": -340.2928161621094, + "eval_loss": 0.6771968007087708, + "eval_rewards/accuracies": 0.7699999809265137, + "eval_rewards/chosen": -2.0765931606292725, + "eval_rewards/margins": 3.540759563446045, + "eval_rewards/rejected": -5.617353439331055, + "eval_runtime": 498.7331, + "eval_samples_per_second": 4.01, + "eval_steps_per_second": 0.501, + "step": 2400 + }, + { + "epoch": 2.49, + "learning_rate": 9.45273631840796e-08, + "logits/chosen": -2.83256196975708, + "logits/rejected": -2.817204713821411, + "logps/chosen": -316.6646423339844, + "logps/rejected": -365.89801025390625, + "loss": 0.0216, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.18594422936439514, + "rewards/margins": 8.41205883026123, + "rewards/rejected": -8.226114273071289, + "step": 2410 + }, + { + "epoch": 2.5, + "learning_rate": 9.261385380788366e-08, + "logits/chosen": -2.800245523452759, + "logits/rejected": -2.8351614475250244, + "logps/chosen": -310.83319091796875, + "logps/rejected": -358.3487854003906, + "loss": 0.0132, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": 0.3432609736919403, + "rewards/margins": 8.361449241638184, + "rewards/rejected": -8.0181884765625, + "step": 2420 + }, + { + "epoch": 2.51, + "learning_rate": 9.070034443168771e-08, + "logits/chosen": -2.827857732772827, + "logits/rejected": -2.817192792892456, + "logps/chosen": -314.1051025390625, + "logps/rejected": -337.80670166015625, + "loss": 0.0122, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -0.01949559524655342, + "rewards/margins": 8.174915313720703, + "rewards/rejected": -8.19441032409668, + "step": 2430 + }, + { + "epoch": 2.52, + "learning_rate": 8.878683505549177e-08, + "logits/chosen": -2.8365912437438965, + "logits/rejected": -2.8096940517425537, + "logps/chosen": -315.1009216308594, + "logps/rejected": -345.9786376953125, + "loss": 0.0168, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 0.3356740176677704, + "rewards/margins": 8.638749122619629, + "rewards/rejected": -8.303074836730957, + "step": 2440 + }, + { + "epoch": 2.53, + "learning_rate": 8.687332567929582e-08, + "logits/chosen": -2.7972190380096436, + "logits/rejected": -2.7952070236206055, + "logps/chosen": -331.07855224609375, + "logps/rejected": -361.274658203125, + "loss": 0.0164, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.03445981815457344, + "rewards/margins": 8.762239456176758, + "rewards/rejected": -8.796700477600098, + "step": 2450 + }, + { + "epoch": 2.54, + "learning_rate": 8.495981630309988e-08, + "logits/chosen": -2.782491683959961, + "logits/rejected": -2.798245906829834, + "logps/chosen": -388.80487060546875, + "logps/rejected": -385.5155944824219, + "loss": 0.0169, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13862161338329315, + "rewards/margins": 8.304658889770508, + "rewards/rejected": -8.166036605834961, + "step": 2460 + }, + { + "epoch": 2.55, + "learning_rate": 8.304630692690395e-08, + "logits/chosen": -2.7862939834594727, + "logits/rejected": -2.800889015197754, + "logps/chosen": -340.80450439453125, + "logps/rejected": -324.3976135253906, + "loss": 0.0209, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.042531587183475494, + "rewards/margins": 9.009973526000977, + "rewards/rejected": -8.967442512512207, + "step": 2470 + }, + { + "epoch": 2.56, + "learning_rate": 8.1132797550708e-08, + "logits/chosen": -2.860032796859741, + "logits/rejected": -2.8409759998321533, + "logps/chosen": -374.20684814453125, + "logps/rejected": -355.40087890625, + "loss": 0.0088, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.24193501472473145, + "rewards/margins": 8.717119216918945, + "rewards/rejected": -8.475184440612793, + "step": 2480 + }, + { + "epoch": 2.57, + "learning_rate": 7.921928817451206e-08, + "logits/chosen": -2.8591599464416504, + "logits/rejected": -2.8410227298736572, + "logps/chosen": -335.44976806640625, + "logps/rejected": -363.7392578125, + "loss": 0.0115, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": 0.2703700363636017, + "rewards/margins": 9.15362548828125, + "rewards/rejected": -8.883255004882812, + "step": 2490 + }, + { + "epoch": 2.58, + "learning_rate": 7.73057787983161e-08, + "logits/chosen": -2.8341917991638184, + "logits/rejected": -2.8292124271392822, + "logps/chosen": -357.0975341796875, + "logps/rejected": -373.5026550292969, + "loss": 0.015, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19130437076091766, + "rewards/margins": 8.608747482299805, + "rewards/rejected": -8.80005168914795, + "step": 2500 + }, + { + "epoch": 2.58, + "eval_logits/chosen": -2.7877330780029297, + "eval_logits/rejected": -2.7656519412994385, + "eval_logps/chosen": -375.72076416015625, + "eval_logps/rejected": -343.2975769042969, + "eval_loss": 0.6893309354782104, + "eval_rewards/accuracies": 0.7739999890327454, + "eval_rewards/chosen": -2.285490036010742, + "eval_rewards/margins": 3.6323366165161133, + "eval_rewards/rejected": -5.9178266525268555, + "eval_runtime": 499.4861, + "eval_samples_per_second": 4.004, + "eval_steps_per_second": 0.501, + "step": 2500 + }, + { + "epoch": 2.59, + "learning_rate": 7.539226942212017e-08, + "logits/chosen": -2.7619540691375732, + "logits/rejected": -2.774597644805908, + "logps/chosen": -318.77679443359375, + "logps/rejected": -376.66656494140625, + "loss": 0.0153, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.632733941078186, + "rewards/margins": 8.800397872924805, + "rewards/rejected": -9.433133125305176, + "step": 2510 + }, + { + "epoch": 2.6, + "learning_rate": 7.347876004592423e-08, + "logits/chosen": -2.8549551963806152, + "logits/rejected": -2.868101119995117, + "logps/chosen": -300.10797119140625, + "logps/rejected": -321.92755126953125, + "loss": 0.0138, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -0.15570883452892303, + "rewards/margins": 8.049636840820312, + "rewards/rejected": -8.205345153808594, + "step": 2520 + }, + { + "epoch": 2.61, + "learning_rate": 7.156525066972828e-08, + "logits/chosen": -2.8064002990722656, + "logits/rejected": -2.805536985397339, + "logps/chosen": -322.34814453125, + "logps/rejected": -366.001220703125, + "loss": 0.0134, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -0.4449862837791443, + "rewards/margins": 8.548049926757812, + "rewards/rejected": -8.993036270141602, + "step": 2530 + }, + { + "epoch": 2.62, + "learning_rate": 6.965174129353234e-08, + "logits/chosen": -2.832437515258789, + "logits/rejected": -2.830235004425049, + "logps/chosen": -385.7052307128906, + "logps/rejected": -384.5820617675781, + "loss": 0.0196, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.25569507479667664, + "rewards/margins": 8.444574356079102, + "rewards/rejected": -8.188879013061523, + "step": 2540 + }, + { + "epoch": 2.63, + "learning_rate": 6.773823191733639e-08, + "logits/chosen": -2.768073320388794, + "logits/rejected": -2.8013052940368652, + "logps/chosen": -321.265625, + "logps/rejected": -381.2618103027344, + "loss": 0.0159, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.49428802728652954, + "rewards/margins": 9.570723533630371, + "rewards/rejected": -10.065011978149414, + "step": 2550 + }, + { + "epoch": 2.64, + "learning_rate": 6.582472254114045e-08, + "logits/chosen": -2.8302340507507324, + "logits/rejected": -2.800615072250366, + "logps/chosen": -331.8475036621094, + "logps/rejected": -366.77777099609375, + "loss": 0.0167, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.04411726072430611, + "rewards/margins": 9.26185131072998, + "rewards/rejected": -9.305968284606934, + "step": 2560 + }, + { + "epoch": 2.65, + "learning_rate": 6.391121316494451e-08, + "logits/chosen": -2.8136613368988037, + "logits/rejected": -2.8105883598327637, + "logps/chosen": -348.7919006347656, + "logps/rejected": -370.4637145996094, + "loss": 0.013, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -0.12355004251003265, + "rewards/margins": 8.435611724853516, + "rewards/rejected": -8.559160232543945, + "step": 2570 + }, + { + "epoch": 2.66, + "learning_rate": 6.199770378874856e-08, + "logits/chosen": -2.8767693042755127, + "logits/rejected": -2.8680922985076904, + "logps/chosen": -328.68975830078125, + "logps/rejected": -375.6171875, + "loss": 0.0135, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -0.11462025344371796, + "rewards/margins": 8.414809226989746, + "rewards/rejected": -8.52942943572998, + "step": 2580 + }, + { + "epoch": 2.67, + "learning_rate": 6.008419441255262e-08, + "logits/chosen": -2.7816498279571533, + "logits/rejected": -2.7839863300323486, + "logps/chosen": -349.9986267089844, + "logps/rejected": -363.5763854980469, + "loss": 0.0155, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.08176889270544052, + "rewards/margins": 8.521749496459961, + "rewards/rejected": -8.439981460571289, + "step": 2590 + }, + { + "epoch": 2.69, + "learning_rate": 5.817068503635668e-08, + "logits/chosen": -2.8423211574554443, + "logits/rejected": -2.8518404960632324, + "logps/chosen": -342.69873046875, + "logps/rejected": -373.8768005371094, + "loss": 0.0118, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -0.09304714947938919, + "rewards/margins": 8.714309692382812, + "rewards/rejected": -8.807355880737305, + "step": 2600 + }, + { + "epoch": 2.69, + "eval_logits/chosen": -2.796252965927124, + "eval_logits/rejected": -2.776649236679077, + "eval_logps/chosen": -377.9178771972656, + "eval_logps/rejected": -346.03851318359375, + "eval_loss": 0.6936560273170471, + "eval_rewards/accuracies": 0.765999972820282, + "eval_rewards/chosen": -2.505204200744629, + "eval_rewards/margins": 3.686720609664917, + "eval_rewards/rejected": -6.191924095153809, + "eval_runtime": 499.6825, + "eval_samples_per_second": 4.003, + "eval_steps_per_second": 0.5, + "step": 2600 + }, + { + "epoch": 2.7, + "learning_rate": 5.6257175660160735e-08, + "logits/chosen": -2.8501954078674316, + "logits/rejected": -2.8585927486419678, + "logps/chosen": -330.80767822265625, + "logps/rejected": -349.1222229003906, + "loss": 0.0099, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": 0.3493829369544983, + "rewards/margins": 8.983522415161133, + "rewards/rejected": -8.634138107299805, + "step": 2610 + }, + { + "epoch": 2.71, + "learning_rate": 5.4343666283964784e-08, + "logits/chosen": -2.84842586517334, + "logits/rejected": -2.8621630668640137, + "logps/chosen": -339.36761474609375, + "logps/rejected": -352.922607421875, + "loss": 0.0115, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -0.036901332437992096, + "rewards/margins": 9.127528190612793, + "rewards/rejected": -9.164429664611816, + "step": 2620 + }, + { + "epoch": 2.72, + "learning_rate": 5.243015690776884e-08, + "logits/chosen": -2.8185672760009766, + "logits/rejected": -2.8214826583862305, + "logps/chosen": -354.1401672363281, + "logps/rejected": -386.0504455566406, + "loss": 0.0123, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.03651510551571846, + "rewards/margins": 9.25926685333252, + "rewards/rejected": -9.295781135559082, + "step": 2630 + }, + { + "epoch": 2.73, + "learning_rate": 5.05166475315729e-08, + "logits/chosen": -2.7941179275512695, + "logits/rejected": -2.777252674102783, + "logps/chosen": -359.3582763671875, + "logps/rejected": -361.58026123046875, + "loss": 0.0118, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -1.0606005191802979, + "rewards/margins": 8.198938369750977, + "rewards/rejected": -9.259538650512695, + "step": 2640 + }, + { + "epoch": 2.74, + "learning_rate": 4.860313815537696e-08, + "logits/chosen": -2.8406758308410645, + "logits/rejected": -2.7846803665161133, + "logps/chosen": -351.44287109375, + "logps/rejected": -380.54876708984375, + "loss": 0.0231, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5040115714073181, + "rewards/margins": 8.634989738464355, + "rewards/rejected": -9.13900089263916, + "step": 2650 + }, + { + "epoch": 2.75, + "learning_rate": 4.668962877918101e-08, + "logits/chosen": -2.7726569175720215, + "logits/rejected": -2.7721505165100098, + "logps/chosen": -340.0118103027344, + "logps/rejected": -435.3929748535156, + "loss": 0.0142, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -0.6642410755157471, + "rewards/margins": 9.08407211303711, + "rewards/rejected": -9.748313903808594, + "step": 2660 + }, + { + "epoch": 2.76, + "learning_rate": 4.477611940298507e-08, + "logits/chosen": -2.775136947631836, + "logits/rejected": -2.7988369464874268, + "logps/chosen": -354.5794677734375, + "logps/rejected": -364.4373474121094, + "loss": 0.0159, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.16665682196617126, + "rewards/margins": 8.789073944091797, + "rewards/rejected": -8.955730438232422, + "step": 2670 + }, + { + "epoch": 2.77, + "learning_rate": 4.2862610026789124e-08, + "logits/chosen": -2.7736451625823975, + "logits/rejected": -2.815216064453125, + "logps/chosen": -350.3184509277344, + "logps/rejected": -368.71844482421875, + "loss": 0.0162, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": 0.13622841238975525, + "rewards/margins": 9.146402359008789, + "rewards/rejected": -9.010174751281738, + "step": 2680 + }, + { + "epoch": 2.78, + "learning_rate": 4.0949100650593186e-08, + "logits/chosen": -2.8010916709899902, + "logits/rejected": -2.8058903217315674, + "logps/chosen": -297.83599853515625, + "logps/rejected": -378.70892333984375, + "loss": 0.0113, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -0.6468523144721985, + "rewards/margins": 8.825592041015625, + "rewards/rejected": -9.472444534301758, + "step": 2690 + }, + { + "epoch": 2.79, + "learning_rate": 3.903559127439724e-08, + "logits/chosen": -2.7962281703948975, + "logits/rejected": -2.7654640674591064, + "logps/chosen": -352.7866516113281, + "logps/rejected": -370.88226318359375, + "loss": 0.0127, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": 0.05651530623435974, + "rewards/margins": 8.773322105407715, + "rewards/rejected": -8.716808319091797, + "step": 2700 + }, + { + "epoch": 2.79, + "eval_logits/chosen": -2.7651772499084473, + "eval_logits/rejected": -2.7437028884887695, + "eval_logps/chosen": -377.7486267089844, + "eval_logps/rejected": -345.6202392578125, + "eval_loss": 0.6868348717689514, + "eval_rewards/accuracies": 0.7699999809265137, + "eval_rewards/chosen": -2.4882779121398926, + "eval_rewards/margins": 3.661820411682129, + "eval_rewards/rejected": -6.1500983238220215, + "eval_runtime": 499.7574, + "eval_samples_per_second": 4.002, + "eval_steps_per_second": 0.5, + "step": 2700 + }, + { + "epoch": 2.8, + "learning_rate": 3.71220818982013e-08, + "logits/chosen": -2.811039924621582, + "logits/rejected": -2.7763166427612305, + "logps/chosen": -326.4056701660156, + "logps/rejected": -355.9825744628906, + "loss": 0.0112, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -0.38241684436798096, + "rewards/margins": 8.648158073425293, + "rewards/rejected": -9.0305757522583, + "step": 2710 + }, + { + "epoch": 2.81, + "learning_rate": 3.520857252200535e-08, + "logits/chosen": -2.7573506832122803, + "logits/rejected": -2.760462522506714, + "logps/chosen": -360.5853271484375, + "logps/rejected": -408.0951843261719, + "loss": 0.0193, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13503775000572205, + "rewards/margins": 9.324322700500488, + "rewards/rejected": -9.18928337097168, + "step": 2720 + }, + { + "epoch": 2.82, + "learning_rate": 3.3295063145809414e-08, + "logits/chosen": -2.8028712272644043, + "logits/rejected": -2.8158020973205566, + "logps/chosen": -311.5306396484375, + "logps/rejected": -366.0463562011719, + "loss": 0.0156, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.29903894662857056, + "rewards/margins": 8.486640930175781, + "rewards/rejected": -8.78568172454834, + "step": 2730 + }, + { + "epoch": 2.83, + "learning_rate": 3.138155376961347e-08, + "logits/chosen": -2.806452512741089, + "logits/rejected": -2.800107002258301, + "logps/chosen": -352.19903564453125, + "logps/rejected": -366.1153869628906, + "loss": 0.0088, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": 0.05723688751459122, + "rewards/margins": 8.847169876098633, + "rewards/rejected": -8.789933204650879, + "step": 2740 + }, + { + "epoch": 2.84, + "learning_rate": 2.9468044393417525e-08, + "logits/chosen": -2.8288140296936035, + "logits/rejected": -2.819532871246338, + "logps/chosen": -319.1861572265625, + "logps/rejected": -365.1618347167969, + "loss": 0.0146, + "rewards/accuracies": 0.981249988079071, + "rewards/chosen": 0.272183895111084, + "rewards/margins": 8.840131759643555, + "rewards/rejected": -8.567949295043945, + "step": 2750 + }, + { + "epoch": 2.85, + "learning_rate": 2.755453501722158e-08, + "logits/chosen": -2.7816238403320312, + "logits/rejected": -2.7954201698303223, + "logps/chosen": -333.7603759765625, + "logps/rejected": -378.86236572265625, + "loss": 0.0179, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.39214563369750977, + "rewards/margins": 8.563885688781738, + "rewards/rejected": -8.956029891967773, + "step": 2760 + }, + { + "epoch": 2.86, + "learning_rate": 2.564102564102564e-08, + "logits/chosen": -2.768709659576416, + "logits/rejected": -2.787172794342041, + "logps/chosen": -321.18896484375, + "logps/rejected": -379.5654296875, + "loss": 0.0082, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08633549511432648, + "rewards/margins": 8.47877311706543, + "rewards/rejected": -8.565108299255371, + "step": 2770 + }, + { + "epoch": 2.87, + "learning_rate": 2.3727516264829695e-08, + "logits/chosen": -2.8109827041625977, + "logits/rejected": -2.797381639480591, + "logps/chosen": -363.3731689453125, + "logps/rejected": -408.6158142089844, + "loss": 0.014, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.22468623518943787, + "rewards/margins": 9.187434196472168, + "rewards/rejected": -9.412120819091797, + "step": 2780 + }, + { + "epoch": 2.88, + "learning_rate": 2.1814006888633754e-08, + "logits/chosen": -2.75907564163208, + "logits/rejected": -2.7653088569641113, + "logps/chosen": -343.4961242675781, + "logps/rejected": -381.5200500488281, + "loss": 0.0168, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.4573872983455658, + "rewards/margins": 9.753976821899414, + "rewards/rejected": -9.296588897705078, + "step": 2790 + }, + { + "epoch": 2.89, + "learning_rate": 1.990049751243781e-08, + "logits/chosen": -2.74211049079895, + "logits/rejected": -2.787912368774414, + "logps/chosen": -346.13165283203125, + "logps/rejected": -360.9788513183594, + "loss": 0.0149, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": 0.0645522028207779, + "rewards/margins": 8.483091354370117, + "rewards/rejected": -8.418540000915527, + "step": 2800 + }, + { + "epoch": 2.89, + "eval_logits/chosen": -2.7622878551483154, + "eval_logits/rejected": -2.7409138679504395, + "eval_logps/chosen": -376.6426086425781, + "eval_logps/rejected": -344.5401306152344, + "eval_loss": 0.6851915121078491, + "eval_rewards/accuracies": 0.7699999809265137, + "eval_rewards/chosen": -2.3776779174804688, + "eval_rewards/margins": 3.66440486907959, + "eval_rewards/rejected": -6.0420823097229, + "eval_runtime": 499.4818, + "eval_samples_per_second": 4.004, + "eval_steps_per_second": 0.501, + "step": 2800 + }, + { + "epoch": 2.9, + "learning_rate": 1.7986988136241865e-08, + "logits/chosen": -2.7683703899383545, + "logits/rejected": -2.7223987579345703, + "logps/chosen": -355.0417175292969, + "logps/rejected": -366.55462646484375, + "loss": 0.0167, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -0.14950156211853027, + "rewards/margins": 8.354609489440918, + "rewards/rejected": -8.504112243652344, + "step": 2810 + }, + { + "epoch": 2.91, + "learning_rate": 1.6073478760045924e-08, + "logits/chosen": -2.766155481338501, + "logits/rejected": -2.7364845275878906, + "logps/chosen": -369.193115234375, + "logps/rejected": -349.85107421875, + "loss": 0.0144, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": 0.08105222135782242, + "rewards/margins": 8.850485801696777, + "rewards/rejected": -8.76943302154541, + "step": 2820 + }, + { + "epoch": 2.92, + "learning_rate": 1.4159969383849981e-08, + "logits/chosen": -2.757335662841797, + "logits/rejected": -2.735102415084839, + "logps/chosen": -344.85015869140625, + "logps/rejected": -362.55096435546875, + "loss": 0.0128, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.3330300748348236, + "rewards/margins": 8.652267456054688, + "rewards/rejected": -8.985297203063965, + "step": 2830 + }, + { + "epoch": 2.93, + "learning_rate": 1.2246460007654037e-08, + "logits/chosen": -2.774275302886963, + "logits/rejected": -2.7630369663238525, + "logps/chosen": -316.2999572753906, + "logps/rejected": -335.5289611816406, + "loss": 0.0233, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.3553180992603302, + "rewards/margins": 8.010857582092285, + "rewards/rejected": -8.366175651550293, + "step": 2840 + }, + { + "epoch": 2.94, + "learning_rate": 1.0332950631458094e-08, + "logits/chosen": -2.743933916091919, + "logits/rejected": -2.7586922645568848, + "logps/chosen": -362.8529052734375, + "logps/rejected": -367.4391784667969, + "loss": 0.0147, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.43926841020584106, + "rewards/margins": 9.078571319580078, + "rewards/rejected": -8.639303207397461, + "step": 2850 + }, + { + "epoch": 2.95, + "learning_rate": 8.419441255262151e-09, + "logits/chosen": -2.7890658378601074, + "logits/rejected": -2.7745556831359863, + "logps/chosen": -299.62799072265625, + "logps/rejected": -343.54083251953125, + "loss": 0.0124, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": -0.6124383211135864, + "rewards/margins": 8.287772178649902, + "rewards/rejected": -8.900211334228516, + "step": 2860 + }, + { + "epoch": 2.96, + "learning_rate": 6.505931879066207e-09, + "logits/chosen": -2.8043646812438965, + "logits/rejected": -2.8070147037506104, + "logps/chosen": -316.2520446777344, + "logps/rejected": -351.5118103027344, + "loss": 0.0224, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.41816624999046326, + "rewards/margins": 8.223325729370117, + "rewards/rejected": -8.64149284362793, + "step": 2870 + }, + { + "epoch": 2.97, + "learning_rate": 4.592422502870264e-09, + "logits/chosen": -2.7751667499542236, + "logits/rejected": -2.775235414505005, + "logps/chosen": -327.26068115234375, + "logps/rejected": -369.03839111328125, + "loss": 0.0158, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.019028399139642715, + "rewards/margins": 9.00733757019043, + "rewards/rejected": -9.026365280151367, + "step": 2880 + }, + { + "epoch": 2.98, + "learning_rate": 2.6789131266743202e-09, + "logits/chosen": -2.8183040618896484, + "logits/rejected": -2.7807929515838623, + "logps/chosen": -337.8726806640625, + "logps/rejected": -379.15032958984375, + "loss": 0.0222, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -0.21741366386413574, + "rewards/margins": 8.339401245117188, + "rewards/rejected": -8.556814193725586, + "step": 2890 + }, + { + "epoch": 3.0, + "learning_rate": 7.654037504783773e-10, + "logits/chosen": -2.847442865371704, + "logits/rejected": -2.8513660430908203, + "logps/chosen": -318.14996337890625, + "logps/rejected": -386.6273498535156, + "loss": 0.0105, + "rewards/accuracies": 0.9937499761581421, + "rewards/chosen": 0.028394419699907303, + "rewards/margins": 8.268038749694824, + "rewards/rejected": -8.239645004272461, + "step": 2900 + }, + { + "epoch": 3.0, + "eval_logits/chosen": -2.7662971019744873, + "eval_logits/rejected": -2.7454662322998047, + "eval_logps/chosen": -376.1639099121094, + "eval_logps/rejected": -344.25634765625, + "eval_loss": 0.6832324266433716, + "eval_rewards/accuracies": 0.7639999985694885, + "eval_rewards/chosen": -2.3298091888427734, + "eval_rewards/margins": 3.683899402618408, + "eval_rewards/rejected": -6.013708591461182, + "eval_runtime": 499.2953, + "eval_samples_per_second": 4.006, + "eval_steps_per_second": 0.501, + "step": 2900 + }, + { + "epoch": 3.0, + "step": 2904, + "total_flos": 0.0, + "train_loss": 0.2301063642942298, + "train_runtime": 127972.4677, + "train_samples_per_second": 1.453, + "train_steps_per_second": 0.023 + } + ], + "logging_steps": 10, + "max_steps": 2904, + "num_train_epochs": 3, + "save_steps": 500, + "total_flos": 0.0, + "trial_name": null, + "trial_params": null +}