diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,8703 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0017308524448292, + "eval_steps": 500, + "global_step": 578, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0017308524448290783, + "grad_norm": 46.5, + "kl": 0.0, + "learning_rate": 1.4285714285714287e-07, + "logits/chosen": -6239313.454545454, + "logits/rejected": -4940240.761904762, + "logps/chosen": -236.17436079545453, + "logps/rejected": -209.70107886904762, + "loss": 0.5, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0034617048896581565, + "grad_norm": 38.25, + "kl": 0.0, + "learning_rate": 2.8571428571428575e-07, + "logits/chosen": -2665428.3076923075, + "logits/rejected": -1073632.5263157894, + "logps/chosen": -155.0839562049279, + "logps/rejected": -255.23524876644737, + "loss": 0.5, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 2 + }, + { + "epoch": 0.005192557334487235, + "grad_norm": 39.25, + "kl": 0.11506986618041992, + "learning_rate": 4.285714285714286e-07, + "logits/chosen": -1627763.2, + "logits/rejected": -1337906.8235294118, + "logps/chosen": -214.50914713541667, + "logps/rejected": -210.7369887408088, + "loss": 0.5011, + "rewards/chosen": -0.005750782291094462, + "rewards/margins": -0.006572681913773219, + "rewards/rejected": 0.0008218996226787567, + "step": 3 + }, + { + "epoch": 0.006923409779316313, + "grad_norm": 33.5, + "kl": 0.13516950607299805, + "learning_rate": 5.714285714285715e-07, + "logits/chosen": -9900144.0, + "logits/rejected": 2390790.4, + "logps/chosen": -236.11850873161765, + "logps/rejected": -187.40989583333334, + "loss": 0.5052, + "rewards/chosen": 0.00019769019940320184, + "rewards/margins": -0.03652356716932035, + "rewards/rejected": 0.03672125736872355, + "step": 4 + }, + { + "epoch": 0.00865426222414539, + "grad_norm": 39.0, + "kl": 0.16583895683288574, + "learning_rate": 7.142857142857143e-07, + "logits/chosen": 273089.5, + "logits/rejected": -8537414.0, + "logps/chosen": -191.99412536621094, + "logps/rejected": -250.5272216796875, + "loss": 0.5, + "rewards/chosen": -0.018333029001951218, + "rewards/margins": -0.014739224454388022, + "rewards/rejected": -0.0035938045475631952, + "step": 5 + }, + { + "epoch": 0.01038511466897447, + "grad_norm": 37.0, + "kl": 0.1269383430480957, + "learning_rate": 8.571428571428572e-07, + "logits/chosen": 625734.125, + "logits/rejected": -3864760.5, + "logps/chosen": -130.1186065673828, + "logps/rejected": -263.1868591308594, + "loss": 0.4974, + "rewards/chosen": 0.005134785547852516, + "rewards/margins": 0.02763364464044571, + "rewards/rejected": -0.022498859092593193, + "step": 6 + }, + { + "epoch": 0.012115967113803548, + "grad_norm": 45.5, + "kl": 0.058301448822021484, + "learning_rate": 1.0000000000000002e-06, + "logits/chosen": 11427891.2, + "logits/rejected": -8692000.94117647, + "logps/chosen": -247.42198893229167, + "logps/rejected": -318.3026769301471, + "loss": 0.4879, + "rewards/chosen": 0.020061949888865154, + "rewards/margins": 0.09507267031015135, + "rewards/rejected": -0.07501072042128619, + "step": 7 + }, + { + "epoch": 0.013846819558632626, + "grad_norm": 34.75, + "kl": 0.10779595375061035, + "learning_rate": 1.142857142857143e-06, + "logits/chosen": 9745310.315789474, + "logits/rejected": 4968272.0, + "logps/chosen": -266.39432565789474, + "logps/rejected": -200.4144568810096, + "loss": 0.4926, + "rewards/chosen": 0.03307872383218063, + "rewards/margins": 0.08410431619597833, + "rewards/rejected": -0.0510255923637977, + "step": 8 + }, + { + "epoch": 0.015577672003461706, + "grad_norm": 35.75, + "kl": 0.0004693269729614258, + "learning_rate": 1.2857142857142856e-06, + "logits/chosen": 2292229.3333333335, + "logits/rejected": -5866576.571428572, + "logps/chosen": -164.06934950086804, + "logps/rejected": -255.37636021205358, + "loss": 0.4857, + "rewards/chosen": 0.017767790291044448, + "rewards/margins": 0.12696768035964362, + "rewards/rejected": -0.10919989006859916, + "step": 9 + }, + { + "epoch": 0.01730852444829078, + "grad_norm": 58.0, + "kl": 0.08134031295776367, + "learning_rate": 1.4285714285714286e-06, + "logits/chosen": -2206551.1428571427, + "logits/rejected": -2327785.3333333335, + "logps/chosen": -210.65478515625, + "logps/rejected": -399.8186848958333, + "loss": 0.4595, + "rewards/chosen": 0.041608184576034546, + "rewards/margins": 0.30469969577259487, + "rewards/rejected": -0.2630915111965603, + "step": 10 + }, + { + "epoch": 0.019039376893119863, + "grad_norm": 33.0, + "kl": 0.06869983673095703, + "learning_rate": 1.5714285714285714e-06, + "logits/chosen": 5004582.315789473, + "logits/rejected": 15390077.538461538, + "logps/chosen": -168.22392835115133, + "logps/rejected": -250.3277869591346, + "loss": 0.4842, + "rewards/chosen": 0.025214639149214093, + "rewards/margins": 0.15980646617499442, + "rewards/rejected": -0.13459182702578032, + "step": 11 + }, + { + "epoch": 0.02077022933794894, + "grad_norm": 35.25, + "kl": 0.03198128938674927, + "learning_rate": 1.7142857142857145e-06, + "logits/chosen": -6019438.5, + "logits/rejected": -12351150.0, + "logps/chosen": -203.1639404296875, + "logps/rejected": -248.1376495361328, + "loss": 0.4481, + "rewards/chosen": 0.022723043337464333, + "rewards/margins": 0.46131726540625095, + "rewards/rejected": -0.4385942220687866, + "step": 12 + }, + { + "epoch": 0.02250108178277802, + "grad_norm": 27.375, + "kl": 0.03030562400817871, + "learning_rate": 1.8571428571428573e-06, + "logits/chosen": -1771344.705882353, + "logits/rejected": 4848613.333333333, + "logps/chosen": -156.79733455882354, + "logps/rejected": -143.33746744791668, + "loss": 0.4641, + "rewards/chosen": 0.040735574329600614, + "rewards/margins": 0.31227434055477965, + "rewards/rejected": -0.27153876622517903, + "step": 13 + }, + { + "epoch": 0.024231934227607096, + "grad_norm": 31.75, + "kl": 0.0, + "learning_rate": 2.0000000000000003e-06, + "logits/chosen": -402277.15789473685, + "logits/rejected": 8271367.384615385, + "logps/chosen": -283.14478824013156, + "logps/rejected": -204.32831280048077, + "loss": 0.4584, + "rewards/chosen": 0.008756919126761587, + "rewards/margins": 0.42188657892619064, + "rewards/rejected": -0.41312965979942906, + "step": 14 + }, + { + "epoch": 0.025962786672436174, + "grad_norm": 36.75, + "kl": 0.0032711029052734375, + "learning_rate": 2.1428571428571427e-06, + "logits/chosen": -811063.5882352941, + "logits/rejected": -12282100.266666668, + "logps/chosen": -176.79848345588235, + "logps/rejected": -325.2669270833333, + "loss": 0.3948, + "rewards/chosen": 0.08565068244934082, + "rewards/margins": 0.9919464588165283, + "rewards/rejected": -0.9062957763671875, + "step": 15 + }, + { + "epoch": 0.027693639117265252, + "grad_norm": 30.375, + "kl": 0.007568359375, + "learning_rate": 2.285714285714286e-06, + "logits/chosen": -1084260.0, + "logits/rejected": 31534450.666666668, + "logps/chosen": -210.231787109375, + "logps/rejected": -276.17873128255206, + "loss": 0.4251, + "rewards/chosen": 0.08407727479934693, + "rewards/margins": 0.8169409394264221, + "rewards/rejected": -0.7328636646270752, + "step": 16 + }, + { + "epoch": 0.02942449156209433, + "grad_norm": 30.0, + "kl": 0.0, + "learning_rate": 2.428571428571429e-06, + "logits/chosen": -1036163.6923076923, + "logits/rejected": 961232.6315789474, + "logps/chosen": -268.2616624098558, + "logps/rejected": -225.40373149671052, + "loss": 0.4054, + "rewards/chosen": 0.010549396276473999, + "rewards/margins": 0.7161211230252919, + "rewards/rejected": -0.7055717267488179, + "step": 17 + }, + { + "epoch": 0.03115534400692341, + "grad_norm": 26.25, + "kl": 0.0, + "learning_rate": 2.571428571428571e-06, + "logits/chosen": 4818987.555555556, + "logits/rejected": -827468.8571428572, + "logps/chosen": -251.50027126736111, + "logps/rejected": -213.26834542410714, + "loss": 0.4032, + "rewards/chosen": 0.03222567505306668, + "rewards/margins": 1.113093238028269, + "rewards/rejected": -1.0808675629752023, + "step": 18 + }, + { + "epoch": 0.03288619645175249, + "grad_norm": 24.625, + "kl": 0.0, + "learning_rate": 2.7142857142857144e-06, + "logits/chosen": -1698852.380952381, + "logits/rejected": 2662217.6363636362, + "logps/chosen": -172.64027622767858, + "logps/rejected": -205.12626509232953, + "loss": 0.4404, + "rewards/chosen": -0.05192979744502476, + "rewards/margins": 0.9354158980505807, + "rewards/rejected": -0.9873456954956055, + "step": 19 + }, + { + "epoch": 0.03461704889658156, + "grad_norm": 22.375, + "kl": 0.0, + "learning_rate": 2.8571428571428573e-06, + "logits/chosen": 6553660.0, + "logits/rejected": -5781368.444444444, + "logps/chosen": -143.31090436662947, + "logps/rejected": -192.53050401475696, + "loss": 0.3749, + "rewards/chosen": 0.09408829041889735, + "rewards/margins": 1.171789647094787, + "rewards/rejected": -1.0777013566758897, + "step": 20 + }, + { + "epoch": 0.036347901341410645, + "grad_norm": 20.375, + "kl": 0.012153387069702148, + "learning_rate": 3e-06, + "logits/chosen": 1939333.8666666667, + "logits/rejected": 1052395.0588235294, + "logps/chosen": -177.10651041666668, + "logps/rejected": -190.28341854319854, + "loss": 0.3776, + "rewards/chosen": 0.045921965440114336, + "rewards/margins": 1.3893623017797283, + "rewards/rejected": -1.343440336339614, + "step": 21 + }, + { + "epoch": 0.038078753786239726, + "grad_norm": 21.0, + "kl": 0.0, + "learning_rate": 3.142857142857143e-06, + "logits/chosen": 3900064.5, + "logits/rejected": 2436417.0, + "logps/chosen": -188.06832885742188, + "logps/rejected": -307.90692138671875, + "loss": 0.3542, + "rewards/chosen": -0.08758784085512161, + "rewards/margins": 1.7587207481265068, + "rewards/rejected": -1.8463085889816284, + "step": 22 + }, + { + "epoch": 0.0398096062310688, + "grad_norm": 22.625, + "kl": 0.000914454460144043, + "learning_rate": 3.285714285714286e-06, + "logits/chosen": -42746.86666666667, + "logits/rejected": 1372338.8235294118, + "logps/chosen": -226.461865234375, + "logps/rejected": -258.68396714154414, + "loss": 0.3754, + "rewards/chosen": -0.14824188550313314, + "rewards/margins": 1.6312208166309432, + "rewards/rejected": -1.7794627021340763, + "step": 23 + }, + { + "epoch": 0.04154045867589788, + "grad_norm": 19.75, + "kl": 0.0, + "learning_rate": 3.428571428571429e-06, + "logits/chosen": -1758418.3333333333, + "logits/rejected": 7633656.0, + "logps/chosen": -143.17743598090277, + "logps/rejected": -136.41956438337053, + "loss": 0.4083, + "rewards/chosen": -0.15943604045444065, + "rewards/margins": 1.379212019935487, + "rewards/rejected": -1.5386480603899275, + "step": 24 + }, + { + "epoch": 0.043271311120726956, + "grad_norm": 22.375, + "kl": 0.0, + "learning_rate": 3.5714285714285718e-06, + "logits/chosen": -3627161.777777778, + "logits/rejected": -1282915.142857143, + "logps/chosen": -189.07590060763889, + "logps/rejected": -282.23025948660717, + "loss": 0.3667, + "rewards/chosen": -0.05861267778608534, + "rewards/margins": 2.27708803850507, + "rewards/rejected": -2.335700716291155, + "step": 25 + }, + { + "epoch": 0.04500216356555604, + "grad_norm": 20.125, + "kl": 0.0, + "learning_rate": 3.7142857142857146e-06, + "logits/chosen": 3197816.8571428573, + "logits/rejected": -24990.666666666668, + "logps/chosen": -133.38133893694197, + "logps/rejected": -252.71739366319446, + "loss": 0.3729, + "rewards/chosen": -0.2894209793635777, + "rewards/margins": 1.5688878127506802, + "rewards/rejected": -1.8583087921142578, + "step": 26 + }, + { + "epoch": 0.04673301601038511, + "grad_norm": 22.0, + "kl": 0.0, + "learning_rate": 3.857142857142858e-06, + "logits/chosen": 8530238.857142856, + "logits/rejected": -6356856.888888889, + "logps/chosen": -228.37095424107142, + "logps/rejected": -290.1540256076389, + "loss": 0.3187, + "rewards/chosen": -0.04233703442982265, + "rewards/margins": 2.3515855594286843, + "rewards/rejected": -2.393922593858507, + "step": 27 + }, + { + "epoch": 0.04846386845521419, + "grad_norm": 17.125, + "kl": 0.0, + "learning_rate": 4.000000000000001e-06, + "logits/chosen": 11541931.294117646, + "logits/rejected": -18899741.866666667, + "logps/chosen": -161.64244887408088, + "logps/rejected": -298.1225911458333, + "loss": 0.3221, + "rewards/chosen": -0.20178667236776912, + "rewards/margins": 3.408641692703845, + "rewards/rejected": -3.6104283650716145, + "step": 28 + }, + { + "epoch": 0.050194720900043274, + "grad_norm": 21.125, + "kl": 0.0, + "learning_rate": 4.1428571428571435e-06, + "logits/chosen": -8176106.0, + "logits/rejected": 8892046.0, + "logps/chosen": -202.91030883789062, + "logps/rejected": -320.7770690917969, + "loss": 0.3603, + "rewards/chosen": -0.17716556787490845, + "rewards/margins": 2.8113109469413757, + "rewards/rejected": -2.988476514816284, + "step": 29 + }, + { + "epoch": 0.05192557334487235, + "grad_norm": 20.75, + "kl": 0.0, + "learning_rate": 4.2857142857142855e-06, + "logits/chosen": 1024021.4736842106, + "logits/rejected": 4154760.0, + "logps/chosen": -166.43669048108552, + "logps/rejected": -148.49478853665866, + "loss": 0.4417, + "rewards/chosen": -0.38098611329731186, + "rewards/margins": 1.4392063704579465, + "rewards/rejected": -1.8201924837552583, + "step": 30 + }, + { + "epoch": 0.05365642578970143, + "grad_norm": 19.375, + "kl": 0.0, + "learning_rate": 4.428571428571429e-06, + "logits/chosen": 12327768.727272727, + "logits/rejected": -4335401.904761905, + "logps/chosen": -141.8250732421875, + "logps/rejected": -224.83528645833334, + "loss": 0.3034, + "rewards/chosen": -0.04076832803812894, + "rewards/margins": 2.3152393841898284, + "rewards/rejected": -2.3560077122279575, + "step": 31 + }, + { + "epoch": 0.055387278234530504, + "grad_norm": 17.125, + "kl": 0.0, + "learning_rate": 4.571428571428572e-06, + "logits/chosen": -7621606.545454546, + "logits/rejected": -5030670.857142857, + "logps/chosen": -168.98353160511363, + "logps/rejected": -240.65597098214286, + "loss": 0.294, + "rewards/chosen": -0.1550229029221968, + "rewards/margins": 2.5555503843150613, + "rewards/rejected": -2.710573287237258, + "step": 32 + }, + { + "epoch": 0.057118130679359586, + "grad_norm": 20.75, + "kl": 0.0, + "learning_rate": 4.714285714285715e-06, + "logits/chosen": -4837878.153846154, + "logits/rejected": -2727320.4210526315, + "logps/chosen": -260.5615985576923, + "logps/rejected": -309.63633326480266, + "loss": 0.3038, + "rewards/chosen": -0.1790018998659574, + "rewards/margins": 3.2555804590464605, + "rewards/rejected": -3.434582358912418, + "step": 33 + }, + { + "epoch": 0.05884898312418866, + "grad_norm": 16.75, + "kl": 0.0, + "learning_rate": 4.857142857142858e-06, + "logits/chosen": 334761.73333333334, + "logits/rejected": -6532100.705882353, + "logps/chosen": -111.3039794921875, + "logps/rejected": -291.9061638327206, + "loss": 0.3181, + "rewards/chosen": -0.061692579587300615, + "rewards/margins": 3.2174934447980394, + "rewards/rejected": -3.27918602438534, + "step": 34 + }, + { + "epoch": 0.06057983556901774, + "grad_norm": 19.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -5367810.0, + "logits/rejected": -11132618.0, + "logps/chosen": -157.48095703125, + "logps/rejected": -343.7468566894531, + "loss": 0.3208, + "rewards/chosen": -0.24039308726787567, + "rewards/margins": 3.439171150326729, + "rewards/rejected": -3.6795642375946045, + "step": 35 + }, + { + "epoch": 0.06231068801384682, + "grad_norm": 21.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -4363456.888888889, + "logits/rejected": -3689830.285714286, + "logps/chosen": -200.380126953125, + "logps/rejected": -326.18729073660717, + "loss": 0.3453, + "rewards/chosen": -0.15904908710055882, + "rewards/margins": 3.3486854840838722, + "rewards/rejected": -3.507734571184431, + "step": 36 + }, + { + "epoch": 0.0640415404586759, + "grad_norm": 24.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14376078.4, + "logits/rejected": -13662346.666666666, + "logps/chosen": -232.297265625, + "logps/rejected": -266.85874430338544, + "loss": 0.3776, + "rewards/chosen": -0.15787798166275024, + "rewards/margins": 3.1688521107037864, + "rewards/rejected": -3.3267300923665366, + "step": 37 + }, + { + "epoch": 0.06577239290350498, + "grad_norm": 19.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 9234006.4, + "logits/rejected": 8716803.764705881, + "logps/chosen": -195.22076822916668, + "logps/rejected": -223.59991096047793, + "loss": 0.3266, + "rewards/chosen": 0.033547862370808916, + "rewards/margins": 2.3299624059714525, + "rewards/rejected": -2.2964145436006436, + "step": 38 + }, + { + "epoch": 0.06750324534833406, + "grad_norm": 23.5, + "kl": 0.22034478187561035, + "learning_rate": 5e-06, + "logits/chosen": -10664284.0, + "logits/rejected": -6822504.5, + "logps/chosen": -254.58663940429688, + "logps/rejected": -303.74578857421875, + "loss": 0.3069, + "rewards/chosen": -0.025931095704436302, + "rewards/margins": 2.927818799391389, + "rewards/rejected": -2.953749895095825, + "step": 39 + }, + { + "epoch": 0.06923409779316313, + "grad_norm": 19.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3069599.6666666665, + "logits/rejected": -6284954.8, + "logps/chosen": -277.9341634114583, + "logps/rejected": -314.748974609375, + "loss": 0.2888, + "rewards/chosen": -0.13368964195251465, + "rewards/margins": 2.755869913101196, + "rewards/rejected": -2.8895595550537108, + "step": 40 + }, + { + "epoch": 0.07096495023799221, + "grad_norm": 22.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 946472.0, + "logits/rejected": 9102588.235294119, + "logps/chosen": -248.21149088541668, + "logps/rejected": -162.85635914522058, + "loss": 0.3098, + "rewards/chosen": 0.07899113496144612, + "rewards/margins": 2.07641756067089, + "rewards/rejected": -1.9974264257094438, + "step": 41 + }, + { + "epoch": 0.07269580268282129, + "grad_norm": 21.0, + "kl": 0.07075059413909912, + "learning_rate": 5e-06, + "logits/chosen": 7388380.19047619, + "logits/rejected": -89609.45454545454, + "logps/chosen": -191.81854538690476, + "logps/rejected": -390.0617009943182, + "loss": 0.3575, + "rewards/chosen": 0.03016080175127302, + "rewards/margins": 3.0582882986440287, + "rewards/rejected": -3.028127496892756, + "step": 42 + }, + { + "epoch": 0.07442665512765037, + "grad_norm": 22.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -6876745.6, + "logits/rejected": -1067452.5, + "logps/chosen": -200.07301025390626, + "logps/rejected": -290.6153564453125, + "loss": 0.3613, + "rewards/chosen": -0.03498818874359131, + "rewards/margins": 2.9018725315729776, + "rewards/rejected": -2.936860720316569, + "step": 43 + }, + { + "epoch": 0.07615750757247945, + "grad_norm": 20.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 3785227.111111111, + "logits/rejected": 1574901.0, + "logps/chosen": -188.12406412760416, + "logps/rejected": -150.00552804129464, + "loss": 0.3361, + "rewards/chosen": 0.11941173341539171, + "rewards/margins": 2.0707845612177773, + "rewards/rejected": -1.9513728278023856, + "step": 44 + }, + { + "epoch": 0.07788836001730852, + "grad_norm": 18.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3288104.0, + "logits/rejected": -1550288.125, + "logps/chosen": -169.7630157470703, + "logps/rejected": -244.53619384765625, + "loss": 0.3335, + "rewards/chosen": -0.1161470040678978, + "rewards/margins": 2.0714645758271217, + "rewards/rejected": -2.1876115798950195, + "step": 45 + }, + { + "epoch": 0.0796192124621376, + "grad_norm": 22.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 2523540.222222222, + "logits/rejected": 807282.2142857143, + "logps/chosen": -206.785400390625, + "logps/rejected": -197.57388741629464, + "loss": 0.3727, + "rewards/chosen": -0.07473884688483344, + "rewards/margins": 1.7757124862973652, + "rewards/rejected": -1.8504513331821986, + "step": 46 + }, + { + "epoch": 0.08135006490696668, + "grad_norm": 22.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3043348.533333333, + "logits/rejected": 10325686.588235294, + "logps/chosen": -231.226220703125, + "logps/rejected": -169.99207261029412, + "loss": 0.3148, + "rewards/chosen": 0.14197413126627603, + "rewards/margins": 2.014080571193321, + "rewards/rejected": -1.872106439927045, + "step": 47 + }, + { + "epoch": 0.08308091735179576, + "grad_norm": 18.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 1849486.9411764706, + "logits/rejected": -2160865.3333333335, + "logps/chosen": -169.53436638327207, + "logps/rejected": -239.082568359375, + "loss": 0.3451, + "rewards/chosen": -0.1786177158355713, + "rewards/margins": 2.253294515609741, + "rewards/rejected": -2.4319122314453123, + "step": 48 + }, + { + "epoch": 0.08481176979662484, + "grad_norm": 21.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 900298.5714285715, + "logits/rejected": -6812352.7272727275, + "logps/chosen": -166.2104259672619, + "logps/rejected": -353.5106312144886, + "loss": 0.3726, + "rewards/chosen": -0.07294606594812303, + "rewards/margins": 2.6174716572740895, + "rewards/rejected": -2.6904177232222124, + "step": 49 + }, + { + "epoch": 0.08654262224145391, + "grad_norm": 19.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9324354.461538462, + "logits/rejected": -14040514.52631579, + "logps/chosen": -223.0580115685096, + "logps/rejected": -286.9508634868421, + "loss": 0.2674, + "rewards/chosen": 0.13786140772012564, + "rewards/margins": 2.67845962452985, + "rewards/rejected": -2.5405982168097245, + "step": 50 + }, + { + "epoch": 0.088273474686283, + "grad_norm": 20.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 930246.5, + "logits/rejected": -7422215.0, + "logps/chosen": -238.2467803955078, + "logps/rejected": -259.29217529296875, + "loss": 0.294, + "rewards/chosen": 0.032952681183815, + "rewards/margins": 3.1361082941293716, + "rewards/rejected": -3.1031556129455566, + "step": 51 + }, + { + "epoch": 0.09000432713111207, + "grad_norm": 20.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 2028204.625, + "logits/rejected": -1362418.25, + "logps/chosen": -190.04937744140625, + "logps/rejected": -216.1826171875, + "loss": 0.3165, + "rewards/chosen": -0.050155334174633026, + "rewards/margins": 2.5483616068959236, + "rewards/rejected": -2.5985169410705566, + "step": 52 + }, + { + "epoch": 0.09173517957594116, + "grad_norm": 26.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 1491913.3333333333, + "logits/rejected": 17979700.363636363, + "logps/chosen": -285.51971726190476, + "logps/rejected": -230.9169256036932, + "loss": 0.3878, + "rewards/chosen": -0.13681457156226748, + "rewards/margins": 2.993390062670687, + "rewards/rejected": -3.1302046342329546, + "step": 53 + }, + { + "epoch": 0.09346603202077022, + "grad_norm": 16.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -5621796.571428572, + "logits/rejected": -3721314.222222222, + "logps/chosen": -154.634521484375, + "logps/rejected": -144.23682996961804, + "loss": 0.3166, + "rewards/chosen": 0.15212011337280273, + "rewards/margins": 2.1092937787373858, + "rewards/rejected": -1.9571736653645833, + "step": 54 + }, + { + "epoch": 0.0951968844655993, + "grad_norm": 20.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13213840.0, + "logits/rejected": -2418853.6, + "logps/chosen": -264.71543375651044, + "logps/rejected": -299.3638916015625, + "loss": 0.2498, + "rewards/chosen": 0.169629176457723, + "rewards/margins": 2.8914440949757894, + "rewards/rejected": -2.7218149185180662, + "step": 55 + }, + { + "epoch": 0.09692773691042839, + "grad_norm": 20.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 1270494.2222222222, + "logits/rejected": 3427853.1428571427, + "logps/chosen": -218.35677083333334, + "logps/rejected": -151.46371023995536, + "loss": 0.3539, + "rewards/chosen": 0.18590817186567518, + "rewards/margins": 1.9470306029395452, + "rewards/rejected": -1.76112243107387, + "step": 56 + }, + { + "epoch": 0.09865858935525747, + "grad_norm": 19.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 960138.5, + "logits/rejected": -11440672.0, + "logps/chosen": -194.89010620117188, + "logps/rejected": -279.1370544433594, + "loss": 0.3033, + "rewards/chosen": 0.042164143174886703, + "rewards/margins": 2.7914009653031826, + "rewards/rejected": -2.749236822128296, + "step": 57 + }, + { + "epoch": 0.10038944180008655, + "grad_norm": 17.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3341974.933333333, + "logits/rejected": -3835131.7647058824, + "logps/chosen": -99.36764322916666, + "logps/rejected": -268.3916015625, + "loss": 0.3016, + "rewards/chosen": -0.05830394426981608, + "rewards/margins": 2.6891182179544484, + "rewards/rejected": -2.7474221622242645, + "step": 58 + }, + { + "epoch": 0.10212029424491562, + "grad_norm": 20.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -5338708.2105263155, + "logits/rejected": -522996.92307692306, + "logps/chosen": -177.21720805921052, + "logps/rejected": -242.44989483173077, + "loss": 0.3425, + "rewards/chosen": -0.03957033157348633, + "rewards/margins": 3.2665699812082143, + "rewards/rejected": -3.3061403127817006, + "step": 59 + }, + { + "epoch": 0.1038511466897447, + "grad_norm": 17.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -4021004.5714285714, + "logits/rejected": 3649617.3333333335, + "logps/chosen": -150.47998046875, + "logps/rejected": -242.73616536458334, + "loss": 0.2824, + "rewards/chosen": 0.04531372019222805, + "rewards/margins": 2.6572553756691164, + "rewards/rejected": -2.611941655476888, + "step": 60 + }, + { + "epoch": 0.10558199913457378, + "grad_norm": 20.0, + "kl": 0.0002346038818359375, + "learning_rate": 5e-06, + "logits/chosen": -4345515.0, + "logits/rejected": -8544926.0, + "logps/chosen": -253.4796142578125, + "logps/rejected": -246.94094848632812, + "loss": 0.2793, + "rewards/chosen": 0.15215471386909485, + "rewards/margins": 3.54859259724617, + "rewards/rejected": -3.396437883377075, + "step": 61 + }, + { + "epoch": 0.10731285157940286, + "grad_norm": 18.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -1196661.3333333333, + "logits/rejected": -3318421.1428571427, + "logps/chosen": -154.80669487847223, + "logps/rejected": -289.3799525669643, + "loss": 0.3092, + "rewards/chosen": 0.0665718052122328, + "rewards/margins": 3.497844584404476, + "rewards/rejected": -3.4312727791922435, + "step": 62 + }, + { + "epoch": 0.10904370402423194, + "grad_norm": 20.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3965800.4444444445, + "logits/rejected": -11330329.142857144, + "logps/chosen": -182.16238064236111, + "logps/rejected": -249.26039341517858, + "loss": 0.3376, + "rewards/chosen": -0.1392565303378635, + "rewards/margins": 3.219647899506584, + "rewards/rejected": -3.3589044298444475, + "step": 63 + }, + { + "epoch": 0.11077455646906101, + "grad_norm": 19.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -2474531.25, + "logits/rejected": -5974583.5, + "logps/chosen": -209.27244567871094, + "logps/rejected": -302.0538635253906, + "loss": 0.325, + "rewards/chosen": 0.020520292222499847, + "rewards/margins": 3.1491325721144676, + "rewards/rejected": -3.1286122798919678, + "step": 64 + }, + { + "epoch": 0.11250540891389009, + "grad_norm": 14.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -5467615.384615385, + "logits/rejected": -3365480.0, + "logps/chosen": -144.33649151141827, + "logps/rejected": -279.4246761924342, + "loss": 0.2657, + "rewards/chosen": -0.11599624156951904, + "rewards/margins": 3.2443882477910897, + "rewards/rejected": -3.3603844893606087, + "step": 65 + }, + { + "epoch": 0.11423626135871917, + "grad_norm": 22.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -1735104.4, + "logits/rejected": 5294354.666666667, + "logps/chosen": -211.195068359375, + "logps/rejected": -247.84977213541666, + "loss": 0.3939, + "rewards/chosen": -0.25041675567626953, + "rewards/margins": 3.1013142267862954, + "rewards/rejected": -3.351730982462565, + "step": 66 + }, + { + "epoch": 0.11596711380354825, + "grad_norm": 19.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -5322431.555555556, + "logits/rejected": -17504265.14285714, + "logps/chosen": -156.61607530381946, + "logps/rejected": -485.01771763392856, + "loss": 0.3061, + "rewards/chosen": 0.008888012833065458, + "rewards/margins": 3.940254797065069, + "rewards/rejected": -3.9313667842320035, + "step": 67 + }, + { + "epoch": 0.11769796624837732, + "grad_norm": 19.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 2996452.2666666666, + "logits/rejected": 11716244.705882354, + "logps/chosen": -192.19383138020834, + "logps/rejected": -302.7353515625, + "loss": 0.2934, + "rewards/chosen": 0.03241715629895528, + "rewards/margins": 2.8701628228028615, + "rewards/rejected": -2.8377456665039062, + "step": 68 + }, + { + "epoch": 0.1194288186932064, + "grad_norm": 17.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 14927777.142857144, + "logits/rejected": -4720057.777777778, + "logps/chosen": -149.42860630580358, + "logps/rejected": -217.70494249131946, + "loss": 0.2898, + "rewards/chosen": -0.16934810365949357, + "rewards/margins": 3.1008769973875983, + "rewards/rejected": -3.270225101047092, + "step": 69 + }, + { + "epoch": 0.12115967113803548, + "grad_norm": 20.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 44536797.538461536, + "logits/rejected": -7564164.2105263155, + "logps/chosen": -682.1751802884615, + "logps/rejected": -270.47216796875, + "loss": 0.2777, + "rewards/chosen": -0.24122038254371056, + "rewards/margins": 2.748665248816795, + "rewards/rejected": -2.989885631360506, + "step": 70 + }, + { + "epoch": 0.12289052358286456, + "grad_norm": 24.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8335642.105263158, + "logits/rejected": -9525784.615384616, + "logps/chosen": -303.1348170230263, + "logps/rejected": -280.5024601862981, + "loss": 0.3256, + "rewards/chosen": 0.11083748466090153, + "rewards/margins": 2.885276489412254, + "rewards/rejected": -2.7744390047513523, + "step": 71 + }, + { + "epoch": 0.12462137602769364, + "grad_norm": 12.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8330821.333333333, + "logits/rejected": -3045587.4, + "logps/chosen": -137.8349812825521, + "logps/rejected": -273.801513671875, + "loss": 0.1949, + "rewards/chosen": 0.5404347976048788, + "rewards/margins": 3.689098318417867, + "rewards/rejected": -3.1486635208129883, + "step": 72 + }, + { + "epoch": 0.12635222847252273, + "grad_norm": 21.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 9142004.444444444, + "logits/rejected": -3207848.8571428573, + "logps/chosen": -188.33430989583334, + "logps/rejected": -207.32901436941964, + "loss": 0.3074, + "rewards/chosen": 0.039681686295403376, + "rewards/margins": 3.252643155673194, + "rewards/rejected": -3.2129614693777904, + "step": 73 + }, + { + "epoch": 0.1280830809173518, + "grad_norm": 22.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -6030457.263157895, + "logits/rejected": -3718584.0, + "logps/chosen": -238.16568153782896, + "logps/rejected": -195.5281700721154, + "loss": 0.3596, + "rewards/chosen": 0.006416631372351395, + "rewards/margins": 2.2222877086898096, + "rewards/rejected": -2.215871077317458, + "step": 74 + }, + { + "epoch": 0.12981393336218086, + "grad_norm": 19.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3802380.705882353, + "logits/rejected": 7472531.2, + "logps/chosen": -168.80958467371323, + "logps/rejected": -285.90989583333334, + "loss": 0.3219, + "rewards/chosen": -0.224185635061825, + "rewards/margins": 2.7274849480273673, + "rewards/rejected": -2.9516705830891925, + "step": 75 + }, + { + "epoch": 0.13154478580700996, + "grad_norm": 21.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -4579526.5, + "logits/rejected": 6935348.0, + "logps/chosen": -246.0178985595703, + "logps/rejected": -326.98834228515625, + "loss": 0.2722, + "rewards/chosen": 0.10884374380111694, + "rewards/margins": 3.900286853313446, + "rewards/rejected": -3.791443109512329, + "step": 76 + }, + { + "epoch": 0.13327563825183902, + "grad_norm": 23.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 2948165.6, + "logits/rejected": -10498065.88235294, + "logps/chosen": -323.5631510416667, + "logps/rejected": -268.2412683823529, + "loss": 0.2814, + "rewards/chosen": -0.016830217838287354, + "rewards/margins": 3.3728826207273146, + "rewards/rejected": -3.389712838565602, + "step": 77 + }, + { + "epoch": 0.13500649069666812, + "grad_norm": 19.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 3724001.263157895, + "logits/rejected": -4905061.230769231, + "logps/chosen": -196.80241313733552, + "logps/rejected": -246.7076697716346, + "loss": 0.3493, + "rewards/chosen": 0.13470386203966642, + "rewards/margins": 2.7459867975489813, + "rewards/rejected": -2.611282935509315, + "step": 78 + }, + { + "epoch": 0.1367373431414972, + "grad_norm": 19.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 882787.5, + "logits/rejected": -2752756.8333333335, + "logps/chosen": -152.87762451171875, + "logps/rejected": -143.17495727539062, + "loss": 0.3448, + "rewards/chosen": -0.04928714632987976, + "rewards/margins": 3.1339206834634146, + "rewards/rejected": -3.1832078297932944, + "step": 79 + }, + { + "epoch": 0.13846819558632625, + "grad_norm": 16.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10046429.6, + "logits/rejected": -9446046.545454545, + "logps/chosen": -211.123828125, + "logps/rejected": -224.4654873934659, + "loss": 0.2231, + "rewards/chosen": -0.09817437529563904, + "rewards/margins": 3.1476648303595454, + "rewards/rejected": -3.2458392056551846, + "step": 80 + }, + { + "epoch": 0.14019904803115535, + "grad_norm": 21.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 1529026.8235294118, + "logits/rejected": -18009732.266666666, + "logps/chosen": -215.9851505055147, + "logps/rejected": -285.50341796875, + "loss": 0.3038, + "rewards/chosen": 0.07493850062875186, + "rewards/margins": 3.3579501278260175, + "rewards/rejected": -3.283011627197266, + "step": 81 + }, + { + "epoch": 0.14192990047598442, + "grad_norm": 19.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 2760504.3076923075, + "logits/rejected": -6295609.684210527, + "logps/chosen": -247.9722618689904, + "logps/rejected": -289.9179173519737, + "loss": 0.2071, + "rewards/chosen": 0.2483532978938176, + "rewards/margins": 3.7582735185198453, + "rewards/rejected": -3.509920220626028, + "step": 82 + }, + { + "epoch": 0.1436607529208135, + "grad_norm": 18.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12023732.923076924, + "logits/rejected": -5987361.263157895, + "logps/chosen": -222.31482872596155, + "logps/rejected": -229.85079152960526, + "loss": 0.2564, + "rewards/chosen": -0.01725879082312951, + "rewards/margins": 3.232607895546114, + "rewards/rejected": -3.2498666863692436, + "step": 83 + }, + { + "epoch": 0.14539160536564258, + "grad_norm": 17.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 612477.6842105263, + "logits/rejected": 610641.0769230769, + "logps/chosen": -134.88838918585526, + "logps/rejected": -210.28667743389423, + "loss": 0.3361, + "rewards/chosen": -0.02787588772020842, + "rewards/margins": 2.743228191306234, + "rewards/rejected": -2.7711040790264425, + "step": 84 + }, + { + "epoch": 0.14712245781047165, + "grad_norm": 19.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8578859.294117646, + "logits/rejected": -4098941.8666666667, + "logps/chosen": -180.28768382352942, + "logps/rejected": -149.85576171875, + "loss": 0.3247, + "rewards/chosen": 0.19388238121481502, + "rewards/margins": 2.1001341614068725, + "rewards/rejected": -1.9062517801920573, + "step": 85 + }, + { + "epoch": 0.14885331025530074, + "grad_norm": 20.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 1573394.5882352942, + "logits/rejected": 10871098.666666666, + "logps/chosen": -191.07115981158088, + "logps/rejected": -244.60188802083334, + "loss": 0.3206, + "rewards/chosen": 0.2755900551291073, + "rewards/margins": 2.1674577563416726, + "rewards/rejected": -1.891867701212565, + "step": 86 + }, + { + "epoch": 0.1505841627001298, + "grad_norm": 21.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 1688797.142857143, + "logits/rejected": -1483586.6666666667, + "logps/chosen": -255.131103515625, + "logps/rejected": -332.4247233072917, + "loss": 0.2465, + "rewards/chosen": 0.21271177700587682, + "rewards/margins": 3.036478909235152, + "rewards/rejected": -2.8237671322292752, + "step": 87 + }, + { + "epoch": 0.1523150151449589, + "grad_norm": 19.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15565679.0, + "logits/rejected": -2823755.25, + "logps/chosen": -226.8282012939453, + "logps/rejected": -287.0397644042969, + "loss": 0.2921, + "rewards/chosen": 0.2816123962402344, + "rewards/margins": 2.9265496730804443, + "rewards/rejected": -2.64493727684021, + "step": 88 + }, + { + "epoch": 0.15404586758978797, + "grad_norm": 20.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -413132.4, + "logits/rejected": -10101343.05882353, + "logps/chosen": -230.32145182291666, + "logps/rejected": -227.8623764935662, + "loss": 0.3061, + "rewards/chosen": 0.1068873405456543, + "rewards/margins": 2.7386797456180347, + "rewards/rejected": -2.6317924050723804, + "step": 89 + }, + { + "epoch": 0.15577672003461704, + "grad_norm": 16.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -2141461.846153846, + "logits/rejected": -5844247.157894737, + "logps/chosen": -115.6215350811298, + "logps/rejected": -274.47286184210526, + "loss": 0.2861, + "rewards/chosen": -0.02882493459261381, + "rewards/margins": 2.667185855780536, + "rewards/rejected": -2.69601079037315, + "step": 90 + }, + { + "epoch": 0.15750757247944613, + "grad_norm": 25.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12380478.545454545, + "logits/rejected": -8278526.4, + "logps/chosen": -197.9210759943182, + "logps/rejected": -223.8676513671875, + "loss": 0.3626, + "rewards/chosen": 0.14342746951363303, + "rewards/margins": 3.160957529328086, + "rewards/rejected": -3.017530059814453, + "step": 91 + }, + { + "epoch": 0.1592384249242752, + "grad_norm": 20.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13449718.588235294, + "logits/rejected": -4683606.933333334, + "logps/chosen": -317.87795840992646, + "logps/rejected": -267.261279296875, + "loss": 0.2726, + "rewards/chosen": 0.3811823059530819, + "rewards/margins": 3.948626662235634, + "rewards/rejected": -3.567444356282552, + "step": 92 + }, + { + "epoch": 0.1609692773691043, + "grad_norm": 16.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -543169.375, + "logits/rejected": -1371199.125, + "logps/chosen": -150.7501678466797, + "logps/rejected": -181.0672149658203, + "loss": 0.3291, + "rewards/chosen": -0.053351566195487976, + "rewards/margins": 2.6354714184999466, + "rewards/rejected": -2.6888229846954346, + "step": 93 + }, + { + "epoch": 0.16270012981393336, + "grad_norm": 18.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 2523244.0, + "logits/rejected": 2864016.380952381, + "logps/chosen": -234.7637606534091, + "logps/rejected": -178.11604817708334, + "loss": 0.2795, + "rewards/chosen": -0.16940477761355313, + "rewards/margins": 2.3981288062545643, + "rewards/rejected": -2.5675335838681175, + "step": 94 + }, + { + "epoch": 0.16443098225876243, + "grad_norm": 21.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 3101979.2, + "logits/rejected": -5256099.0, + "logps/chosen": -156.21822509765624, + "logps/rejected": -249.0777791341146, + "loss": 0.3557, + "rewards/chosen": -0.0824066936969757, + "rewards/margins": 3.3445211907227836, + "rewards/rejected": -3.4269278844197593, + "step": 95 + }, + { + "epoch": 0.16616183470359153, + "grad_norm": 16.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 7612368.7272727275, + "logits/rejected": -5602336.761904762, + "logps/chosen": -176.8388338955966, + "logps/rejected": -257.1624348958333, + "loss": 0.2211, + "rewards/chosen": 0.0035542053255167875, + "rewards/margins": 3.556209743248694, + "rewards/rejected": -3.5526555379231772, + "step": 96 + }, + { + "epoch": 0.1678926871484206, + "grad_norm": 19.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -1705522.0, + "logits/rejected": -19220438.0, + "logps/chosen": -229.12918090820312, + "logps/rejected": -301.3127746582031, + "loss": 0.2716, + "rewards/chosen": 0.30286985635757446, + "rewards/margins": 3.395688831806183, + "rewards/rejected": -3.0928189754486084, + "step": 97 + }, + { + "epoch": 0.1696235395932497, + "grad_norm": 24.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 12111708.307692308, + "logits/rejected": -993389.2631578947, + "logps/chosen": -285.0191180889423, + "logps/rejected": -165.50485711348685, + "loss": 0.2834, + "rewards/chosen": 0.09072128626016471, + "rewards/margins": 2.5426856144237133, + "rewards/rejected": -2.4519643281635486, + "step": 98 + }, + { + "epoch": 0.17135439203807876, + "grad_norm": 22.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -6123332.705882353, + "logits/rejected": -4620013.866666666, + "logps/chosen": -249.7477596507353, + "logps/rejected": -251.340185546875, + "loss": 0.3179, + "rewards/chosen": 0.006986297228757073, + "rewards/margins": 2.8065402319618302, + "rewards/rejected": -2.799553934733073, + "step": 99 + }, + { + "epoch": 0.17308524448290782, + "grad_norm": 17.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3662578.0, + "logits/rejected": -3268372.75, + "logps/chosen": -159.33383178710938, + "logps/rejected": -199.64552307128906, + "loss": 0.3467, + "rewards/chosen": -0.1916092038154602, + "rewards/margins": 2.2172593474388123, + "rewards/rejected": -2.4088685512542725, + "step": 100 + }, + { + "epoch": 0.17481609692773692, + "grad_norm": 19.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11227556.0, + "logits/rejected": -4947570.4, + "logps/chosen": -226.8167521158854, + "logps/rejected": -230.9011474609375, + "loss": 0.2496, + "rewards/chosen": 0.0410018265247345, + "rewards/margins": 3.1503684341907503, + "rewards/rejected": -3.109366607666016, + "step": 101 + }, + { + "epoch": 0.176546949372566, + "grad_norm": 21.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -4914299.368421053, + "logits/rejected": -4945337.846153846, + "logps/chosen": -197.87137643914474, + "logps/rejected": -258.1477614182692, + "loss": 0.3485, + "rewards/chosen": -0.031160028357254833, + "rewards/margins": 3.5723934501771506, + "rewards/rejected": -3.603553478534405, + "step": 102 + }, + { + "epoch": 0.17827780181739505, + "grad_norm": 20.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7837846.222222222, + "logits/rejected": -11910576.0, + "logps/chosen": -174.4716796875, + "logps/rejected": -307.66469029017856, + "loss": 0.3231, + "rewards/chosen": -0.013415685130490197, + "rewards/margins": 2.68944691819331, + "rewards/rejected": -2.7028626033238004, + "step": 103 + }, + { + "epoch": 0.18000865426222415, + "grad_norm": 20.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -2126388.0, + "logits/rejected": 277817.7894736842, + "logps/chosen": -284.61337515024036, + "logps/rejected": -307.56527549342104, + "loss": 0.2237, + "rewards/chosen": 0.27964045451237607, + "rewards/margins": 3.5338759731184615, + "rewards/rejected": -3.2542355186060856, + "step": 104 + }, + { + "epoch": 0.18173950670705322, + "grad_norm": 21.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -1334305.875, + "logits/rejected": -5412106.0, + "logps/chosen": -228.7758331298828, + "logps/rejected": -258.7738342285156, + "loss": 0.3184, + "rewards/chosen": -0.1122078001499176, + "rewards/margins": 2.8912404477596283, + "rewards/rejected": -3.003448247909546, + "step": 105 + }, + { + "epoch": 0.1834703591518823, + "grad_norm": 18.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10178373.05263158, + "logits/rejected": -8054584.615384615, + "logps/chosen": -173.59982781661185, + "logps/rejected": -278.5968674879808, + "loss": 0.3082, + "rewards/chosen": 0.23594951629638672, + "rewards/margins": 3.9558092997624326, + "rewards/rejected": -3.719859783466046, + "step": 106 + }, + { + "epoch": 0.18520121159671138, + "grad_norm": 20.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7455461.647058823, + "logits/rejected": -10272849.066666666, + "logps/chosen": -190.27541934742646, + "logps/rejected": -201.00078125, + "loss": 0.3043, + "rewards/chosen": 0.15676203896017635, + "rewards/margins": 2.838399357889213, + "rewards/rejected": -2.6816373189290363, + "step": 107 + }, + { + "epoch": 0.18693206404154045, + "grad_norm": 21.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8889729.6, + "logits/rejected": -3655384.0, + "logps/chosen": -190.6529541015625, + "logps/rejected": -243.38692220052084, + "loss": 0.3275, + "rewards/chosen": 0.024685271084308624, + "rewards/margins": 3.879149484137694, + "rewards/rejected": -3.8544642130533853, + "step": 108 + }, + { + "epoch": 0.18866291648636954, + "grad_norm": 24.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -1406335.5789473683, + "logits/rejected": -5790012.307692308, + "logps/chosen": -160.40576171875, + "logps/rejected": -207.287353515625, + "loss": 0.3181, + "rewards/chosen": -0.008714937850048668, + "rewards/margins": 3.360927466559507, + "rewards/rejected": -3.3696424044095554, + "step": 109 + }, + { + "epoch": 0.1903937689311986, + "grad_norm": 17.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 1734898.1333333333, + "logits/rejected": -7056189.176470588, + "logps/chosen": -146.405810546875, + "logps/rejected": -183.34127987132354, + "loss": 0.3017, + "rewards/chosen": 0.023581977685292563, + "rewards/margins": 2.700872501438739, + "rewards/rejected": -2.6772905237534466, + "step": 110 + }, + { + "epoch": 0.1921246213760277, + "grad_norm": 18.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 531886.4615384615, + "logits/rejected": -6064568.421052632, + "logps/chosen": -133.6158728966346, + "logps/rejected": -266.82632606907896, + "loss": 0.2827, + "rewards/chosen": -0.16753161870516264, + "rewards/margins": 2.9463925081708653, + "rewards/rejected": -3.113924126876028, + "step": 111 + }, + { + "epoch": 0.19385547382085677, + "grad_norm": 15.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7024642.4, + "logits/rejected": -11717272.727272727, + "logps/chosen": -179.82689208984374, + "logps/rejected": -256.05178000710225, + "loss": 0.1916, + "rewards/chosen": 0.08213082551956177, + "rewards/margins": 3.862654645876451, + "rewards/rejected": -3.780523820356889, + "step": 112 + }, + { + "epoch": 0.19558632626568584, + "grad_norm": 18.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7312199.5, + "logits/rejected": -9691646.0, + "logps/chosen": -190.84982299804688, + "logps/rejected": -189.02178955078125, + "loss": 0.2895, + "rewards/chosen": -0.05902346968650818, + "rewards/margins": 3.4700850546360016, + "rewards/rejected": -3.5291085243225098, + "step": 113 + }, + { + "epoch": 0.19731717871051493, + "grad_norm": 17.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8977872.0, + "logits/rejected": -2104818.5, + "logps/chosen": -153.08140563964844, + "logps/rejected": -362.2707824707031, + "loss": 0.2608, + "rewards/chosen": 0.1350196748971939, + "rewards/margins": 4.345773592591286, + "rewards/rejected": -4.210753917694092, + "step": 114 + }, + { + "epoch": 0.199048031155344, + "grad_norm": 20.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15021820.444444444, + "logits/rejected": 3075554.8571428573, + "logps/chosen": -254.52723524305554, + "logps/rejected": -324.88724190848217, + "loss": 0.2895, + "rewards/chosen": 0.18979620933532715, + "rewards/margins": 4.077897787094116, + "rewards/rejected": -3.888101577758789, + "step": 115 + }, + { + "epoch": 0.2007788836001731, + "grad_norm": 19.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8322480.0, + "logits/rejected": 2723888.4210526315, + "logps/chosen": -225.67595027043268, + "logps/rejected": -165.18586811266448, + "loss": 0.2731, + "rewards/chosen": 0.10532364478478065, + "rewards/margins": 3.168428977008773, + "rewards/rejected": -3.0631053322239925, + "step": 116 + }, + { + "epoch": 0.20250973604500216, + "grad_norm": 13.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3385337.6, + "logits/rejected": -4593669.454545454, + "logps/chosen": -155.44390869140625, + "logps/rejected": -335.2124689275568, + "loss": 0.2003, + "rewards/chosen": -0.034807294607162476, + "rewards/margins": 4.202970068563115, + "rewards/rejected": -4.237777363170277, + "step": 117 + }, + { + "epoch": 0.20424058848983123, + "grad_norm": 18.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -2332099.8181818184, + "logits/rejected": -2151819.8, + "logps/chosen": -175.2345525568182, + "logps/rejected": -296.2167724609375, + "loss": 0.3551, + "rewards/chosen": 0.0834602876143022, + "rewards/margins": 4.01979642347856, + "rewards/rejected": -3.936336135864258, + "step": 118 + }, + { + "epoch": 0.20597144093466033, + "grad_norm": 19.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -1813728.4210526317, + "logits/rejected": -1663898.1538461538, + "logps/chosen": -162.3670076069079, + "logps/rejected": -307.88326322115387, + "loss": 0.2952, + "rewards/chosen": 0.22818475020559212, + "rewards/margins": 3.5458337111994322, + "rewards/rejected": -3.31764896099384, + "step": 119 + }, + { + "epoch": 0.2077022933794894, + "grad_norm": 17.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 5888752.533333333, + "logits/rejected": -2605564.705882353, + "logps/chosen": -156.10784505208332, + "logps/rejected": -253.71030560661765, + "loss": 0.3016, + "rewards/chosen": -0.10760652224222819, + "rewards/margins": 3.660627281899546, + "rewards/rejected": -3.768233804141774, + "step": 120 + }, + { + "epoch": 0.2094331458243185, + "grad_norm": 15.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7869675.076923077, + "logits/rejected": -4793528.842105263, + "logps/chosen": -159.2930626502404, + "logps/rejected": -251.59606291118422, + "loss": 0.2563, + "rewards/chosen": -0.1771384019118089, + "rewards/margins": 3.1404216936242726, + "rewards/rejected": -3.3175600955360816, + "step": 121 + }, + { + "epoch": 0.21116399826914756, + "grad_norm": 18.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3852856.0, + "logits/rejected": 1017586.2352941176, + "logps/chosen": -130.758642578125, + "logps/rejected": -310.29397403492646, + "loss": 0.2913, + "rewards/chosen": -0.17760810852050782, + "rewards/margins": 3.5648737963508155, + "rewards/rejected": -3.7424819048713234, + "step": 122 + }, + { + "epoch": 0.21289485071397662, + "grad_norm": 21.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 394461.8947368421, + "logits/rejected": -11013158.153846154, + "logps/chosen": -211.87534693667763, + "logps/rejected": -266.10584435096155, + "loss": 0.3519, + "rewards/chosen": -0.09623796061465614, + "rewards/margins": 3.0594592017200792, + "rewards/rejected": -3.1556971623347354, + "step": 123 + }, + { + "epoch": 0.21462570315880572, + "grad_norm": 20.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -375859.125, + "logits/rejected": -18902716.0, + "logps/chosen": -144.1176513671875, + "logps/rejected": -328.43532307942706, + "loss": 0.3587, + "rewards/chosen": -0.10335218906402588, + "rewards/margins": 3.1057602961858115, + "rewards/rejected": -3.2091124852498374, + "step": 124 + }, + { + "epoch": 0.2163565556036348, + "grad_norm": 19.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9888419.2, + "logits/rejected": -13417634.823529411, + "logps/chosen": -207.53665364583333, + "logps/rejected": -305.8712373621324, + "loss": 0.2801, + "rewards/chosen": 0.16173944473266602, + "rewards/margins": 3.5778553738313565, + "rewards/rejected": -3.4161159290986904, + "step": 125 + }, + { + "epoch": 0.21808740804846388, + "grad_norm": 15.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12050750.4, + "logits/rejected": -13769786.181818182, + "logps/chosen": -183.52490234375, + "logps/rejected": -278.15651633522725, + "loss": 0.2183, + "rewards/chosen": 0.14027655124664307, + "rewards/margins": 3.1395226283506914, + "rewards/rejected": -2.9992460771040483, + "step": 126 + }, + { + "epoch": 0.21981826049329295, + "grad_norm": 17.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12270530.666666666, + "logits/rejected": -8869286.4, + "logps/chosen": -190.94978841145834, + "logps/rejected": -284.20654296875, + "loss": 0.233, + "rewards/chosen": 0.16113528609275818, + "rewards/margins": 3.4381788194179537, + "rewards/rejected": -3.2770435333251955, + "step": 127 + }, + { + "epoch": 0.22154911293812202, + "grad_norm": 19.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15397617.066666666, + "logits/rejected": -13504640.94117647, + "logps/chosen": -223.714453125, + "logps/rejected": -284.49543313419116, + "loss": 0.2465, + "rewards/chosen": 0.2658435821533203, + "rewards/margins": 3.566493337294635, + "rewards/rejected": -3.3006497551413143, + "step": 128 + }, + { + "epoch": 0.2232799653829511, + "grad_norm": 18.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13344241.333333334, + "logits/rejected": 3126395.2, + "logps/chosen": -236.7048543294271, + "logps/rejected": -158.71007080078124, + "loss": 0.2397, + "rewards/chosen": 0.05072679618994395, + "rewards/margins": 3.053556347886721, + "rewards/rejected": -3.0028295516967773, + "step": 129 + }, + { + "epoch": 0.22501081782778018, + "grad_norm": 18.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -1188826.4, + "logits/rejected": -7702007.529411765, + "logps/chosen": -194.06878255208332, + "logps/rejected": -285.5570714613971, + "loss": 0.287, + "rewards/chosen": -0.050044012069702146, + "rewards/margins": 2.950371789932251, + "rewards/rejected": -3.000415802001953, + "step": 130 + }, + { + "epoch": 0.22674167027260925, + "grad_norm": 24.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9090264.421052631, + "logits/rejected": -13554491.076923076, + "logps/chosen": -228.48311574835526, + "logps/rejected": -317.5834209735577, + "loss": 0.3322, + "rewards/chosen": 0.17829758242556923, + "rewards/margins": 2.7610747524601247, + "rewards/rejected": -2.5827771700345554, + "step": 131 + }, + { + "epoch": 0.22847252271743834, + "grad_norm": 20.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11592303.111111112, + "logits/rejected": -3384686.285714286, + "logps/chosen": -236.07706705729166, + "logps/rejected": -231.01377650669642, + "loss": 0.3204, + "rewards/chosen": 0.08443025747934978, + "rewards/margins": 3.5145191181273687, + "rewards/rejected": -3.430088860648019, + "step": 132 + }, + { + "epoch": 0.2302033751622674, + "grad_norm": 18.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -4109541.0, + "logits/rejected": -10256295.0, + "logps/chosen": -150.5631561279297, + "logps/rejected": -251.85006713867188, + "loss": 0.3414, + "rewards/chosen": -0.03023519366979599, + "rewards/margins": 2.841710902750492, + "rewards/rejected": -2.871946096420288, + "step": 133 + }, + { + "epoch": 0.2319342276070965, + "grad_norm": 19.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17335990.153846152, + "logits/rejected": -11810266.94736842, + "logps/chosen": -255.87794846754807, + "logps/rejected": -292.57041529605266, + "loss": 0.2403, + "rewards/chosen": -0.009397160548430223, + "rewards/margins": 3.8956148698021043, + "rewards/rejected": -3.9050120303505347, + "step": 134 + }, + { + "epoch": 0.23366508005192557, + "grad_norm": 19.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 3816658.933333333, + "logits/rejected": -21506733.17647059, + "logps/chosen": -192.03406575520833, + "logps/rejected": -326.34670840992646, + "loss": 0.2583, + "rewards/chosen": -0.08073126475016276, + "rewards/margins": 4.162721368378285, + "rewards/rejected": -4.243452633128447, + "step": 135 + }, + { + "epoch": 0.23539593249675464, + "grad_norm": 23.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -5209688.421052632, + "logits/rejected": 4241887.076923077, + "logps/chosen": -241.80124383223685, + "logps/rejected": -267.38955453725964, + "loss": 0.3287, + "rewards/chosen": 0.1192607001254433, + "rewards/margins": 3.0787412792082254, + "rewards/rejected": -2.9594805790827823, + "step": 136 + }, + { + "epoch": 0.23712678494158373, + "grad_norm": 16.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13466715.733333332, + "logits/rejected": -6725747.764705882, + "logps/chosen": -169.23546549479167, + "logps/rejected": -301.47449448529414, + "loss": 0.2489, + "rewards/chosen": 0.17315847078959148, + "rewards/margins": 4.150024351419187, + "rewards/rejected": -3.9768658806295956, + "step": 137 + }, + { + "epoch": 0.2388576373864128, + "grad_norm": 20.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7070817.5, + "logits/rejected": -5078294.5, + "logps/chosen": -224.52960205078125, + "logps/rejected": -250.922119140625, + "loss": 0.299, + "rewards/chosen": -0.062333978712558746, + "rewards/margins": 3.91761764138937, + "rewards/rejected": -3.9799516201019287, + "step": 138 + }, + { + "epoch": 0.2405884898312419, + "grad_norm": 20.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10853539.555555556, + "logits/rejected": -8516350.857142856, + "logps/chosen": -245.61515299479166, + "logps/rejected": -311.77559988839283, + "loss": 0.2919, + "rewards/chosen": 0.143819702996148, + "rewards/margins": 4.062012430221316, + "rewards/rejected": -3.9181927272251675, + "step": 139 + }, + { + "epoch": 0.24231934227607096, + "grad_norm": 19.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23256322.666666668, + "logits/rejected": -14564356.8, + "logps/chosen": -252.69757080078125, + "logps/rejected": -324.65302734375, + "loss": 0.1926, + "rewards/chosen": 0.15854175885518393, + "rewards/margins": 4.718555339177449, + "rewards/rejected": -4.5600135803222654, + "step": 140 + }, + { + "epoch": 0.24405019472090003, + "grad_norm": 18.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13540496.0, + "logits/rejected": -14330663.111111112, + "logps/chosen": -261.66441127232144, + "logps/rejected": -279.0700954861111, + "loss": 0.2759, + "rewards/chosen": -0.16050028800964355, + "rewards/margins": 3.2266637219323053, + "rewards/rejected": -3.387164009941949, + "step": 141 + }, + { + "epoch": 0.24578104716572913, + "grad_norm": 22.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16567258.352941176, + "logits/rejected": -9630573.866666667, + "logps/chosen": -248.67431640625, + "logps/rejected": -195.8955078125, + "loss": 0.2962, + "rewards/chosen": 0.18976323744829962, + "rewards/margins": 3.4638149037080654, + "rewards/rejected": -3.2740516662597656, + "step": 142 + }, + { + "epoch": 0.2475118996105582, + "grad_norm": 16.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12402342.153846154, + "logits/rejected": -7839504.842105263, + "logps/chosen": -210.70182917668268, + "logps/rejected": -379.2096525493421, + "loss": 0.2114, + "rewards/chosen": 0.16687591259296125, + "rewards/margins": 5.201281671099335, + "rewards/rejected": -5.034405758506374, + "step": 143 + }, + { + "epoch": 0.2492427520553873, + "grad_norm": 19.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -4255770.285714285, + "logits/rejected": -3999491.111111111, + "logps/chosen": -195.87995256696428, + "logps/rejected": -214.72549099392361, + "loss": 0.2982, + "rewards/chosen": -0.21987019266401017, + "rewards/margins": 2.6090391514793274, + "rewards/rejected": -2.8289093441433377, + "step": 144 + }, + { + "epoch": 0.25097360450021633, + "grad_norm": 22.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -4979902.666666667, + "logits/rejected": -11014548.57142857, + "logps/chosen": -255.74172634548611, + "logps/rejected": -255.93235560825892, + "loss": 0.3164, + "rewards/chosen": -0.09737168418036567, + "rewards/margins": 3.5333697076827764, + "rewards/rejected": -3.630741391863142, + "step": 145 + }, + { + "epoch": 0.25270445694504545, + "grad_norm": 17.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7553311.5, + "logits/rejected": -12855062.0, + "logps/chosen": -181.3147735595703, + "logps/rejected": -319.3401184082031, + "loss": 0.2509, + "rewards/chosen": 0.28162485361099243, + "rewards/margins": 4.752773344516754, + "rewards/rejected": -4.471148490905762, + "step": 146 + }, + { + "epoch": 0.2544353093898745, + "grad_norm": 16.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18317777.6, + "logits/rejected": -8850414.545454545, + "logps/chosen": -208.5265380859375, + "logps/rejected": -253.0011319247159, + "loss": 0.2135, + "rewards/chosen": 0.22829954624176024, + "rewards/margins": 3.561434630914168, + "rewards/rejected": -3.3331350846724077, + "step": 147 + }, + { + "epoch": 0.2561661618347036, + "grad_norm": 22.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 7776780.0, + "logits/rejected": -13990768.0, + "logps/chosen": -222.059326171875, + "logps/rejected": -333.6552734375, + "loss": 0.3596, + "rewards/chosen": -0.05462043881416321, + "rewards/margins": 2.7295163333415986, + "rewards/rejected": -2.7841367721557617, + "step": 148 + }, + { + "epoch": 0.25789701427953265, + "grad_norm": 20.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -501711.76470588235, + "logits/rejected": -4720805.333333333, + "logps/chosen": -165.69982192095588, + "logps/rejected": -258.313623046875, + "loss": 0.3126, + "rewards/chosen": -0.00996632085126989, + "rewards/margins": 2.9709932535302404, + "rewards/rejected": -2.9809595743815103, + "step": 149 + }, + { + "epoch": 0.2596278667243617, + "grad_norm": 16.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11498374.153846154, + "logits/rejected": -12492654.315789474, + "logps/chosen": -136.2483191856971, + "logps/rejected": -243.18302837171052, + "loss": 0.27, + "rewards/chosen": -0.3298172950744629, + "rewards/margins": 2.9719581854970833, + "rewards/rejected": -3.3017754805715462, + "step": 150 + }, + { + "epoch": 0.26135871916919085, + "grad_norm": 20.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18241306.0, + "logits/rejected": -5504833.5, + "logps/chosen": -285.53369140625, + "logps/rejected": -346.5982360839844, + "loss": 0.2638, + "rewards/chosen": 0.2393263578414917, + "rewards/margins": 3.530009150505066, + "rewards/rejected": -3.290682792663574, + "step": 151 + }, + { + "epoch": 0.2630895716140199, + "grad_norm": 21.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9077254.588235294, + "logits/rejected": -16832766.933333334, + "logps/chosen": -218.34030330882354, + "logps/rejected": -284.46845703125, + "loss": 0.3062, + "rewards/chosen": 0.03087810558431289, + "rewards/margins": 3.274915225131839, + "rewards/rejected": -3.244037119547526, + "step": 152 + }, + { + "epoch": 0.264820424058849, + "grad_norm": 17.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 3784820.0, + "logits/rejected": -27976580.266666666, + "logps/chosen": -176.8842342601103, + "logps/rejected": -371.0309244791667, + "loss": 0.2938, + "rewards/chosen": 0.037597624694599825, + "rewards/margins": 3.3108126006874383, + "rewards/rejected": -3.2732149759928384, + "step": 153 + }, + { + "epoch": 0.26655127650367805, + "grad_norm": 19.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 8110033.066666666, + "logits/rejected": -6098681.411764706, + "logps/chosen": -224.60833333333332, + "logps/rejected": -270.91673368566177, + "loss": 0.2667, + "rewards/chosen": 0.19785807927449545, + "rewards/margins": 3.2095348676045736, + "rewards/rejected": -3.011676788330078, + "step": 154 + }, + { + "epoch": 0.2682821289485071, + "grad_norm": 24.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10915676.16, + "logits/rejected": -12048051.42857143, + "logps/chosen": -200.02255859375, + "logps/rejected": -363.83028738839283, + "loss": 0.4141, + "rewards/chosen": -0.20076709747314453, + "rewards/margins": 4.246241580418179, + "rewards/rejected": -4.447008677891323, + "step": 155 + }, + { + "epoch": 0.27001298139333624, + "grad_norm": 21.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 4039928.5, + "logits/rejected": -297019.3125, + "logps/chosen": -122.26431274414062, + "logps/rejected": -205.24542236328125, + "loss": 0.2976, + "rewards/chosen": 0.13761137425899506, + "rewards/margins": 2.9543447345495224, + "rewards/rejected": -2.8167333602905273, + "step": 156 + }, + { + "epoch": 0.2717438338381653, + "grad_norm": 24.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11950380.444444444, + "logits/rejected": -14319524.57142857, + "logps/chosen": -316.365234375, + "logps/rejected": -238.67583356584822, + "loss": 0.3578, + "rewards/chosen": -0.22798464033338758, + "rewards/margins": 2.9639355038839676, + "rewards/rejected": -3.191920144217355, + "step": 157 + }, + { + "epoch": 0.2734746862829944, + "grad_norm": 16.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7220196.923076923, + "logits/rejected": -7274026.105263158, + "logps/chosen": -168.70252403846155, + "logps/rejected": -272.20877878289474, + "loss": 0.2521, + "rewards/chosen": 0.034194111824035645, + "rewards/margins": 3.5594172916914286, + "rewards/rejected": -3.525223179867393, + "step": 158 + }, + { + "epoch": 0.27520553872782344, + "grad_norm": 19.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7790591.111111111, + "logits/rejected": -4348877.714285715, + "logps/chosen": -150.17240397135416, + "logps/rejected": -257.23592703683033, + "loss": 0.2867, + "rewards/chosen": 0.2716523011525472, + "rewards/margins": 3.2814045747121177, + "rewards/rejected": -3.0097522735595703, + "step": 159 + }, + { + "epoch": 0.2769363911726525, + "grad_norm": 21.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -6992461.0, + "logits/rejected": -9803589.0, + "logps/chosen": -197.62159729003906, + "logps/rejected": -200.63839721679688, + "loss": 0.3119, + "rewards/chosen": 0.2022910714149475, + "rewards/margins": 2.61891371011734, + "rewards/rejected": -2.4166226387023926, + "step": 160 + }, + { + "epoch": 0.27866724361748163, + "grad_norm": 15.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 2316156.0, + "logits/rejected": -6923803.294117647, + "logps/chosen": -157.97198893229168, + "logps/rejected": -226.3733340992647, + "loss": 0.2941, + "rewards/chosen": -0.181062380472819, + "rewards/margins": 3.137106035269943, + "rewards/rejected": -3.318168415742762, + "step": 161 + }, + { + "epoch": 0.2803980960623107, + "grad_norm": 19.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 1817514.6666666667, + "logits/rejected": -4328779.428571428, + "logps/chosen": -125.52289496527777, + "logps/rejected": -229.05801827566964, + "loss": 0.3201, + "rewards/chosen": 0.1051819192038642, + "rewards/margins": 2.722233724972558, + "rewards/rejected": -2.617051805768694, + "step": 162 + }, + { + "epoch": 0.28212894850713977, + "grad_norm": 21.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7507249.411764706, + "logits/rejected": 2358673.6, + "logps/chosen": -203.8069278492647, + "logps/rejected": -239.440380859375, + "loss": 0.2975, + "rewards/chosen": 0.04498655655804802, + "rewards/margins": 3.3732679591459385, + "rewards/rejected": -3.3282814025878906, + "step": 163 + }, + { + "epoch": 0.28385980095196883, + "grad_norm": 18.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17538291.2, + "logits/rejected": -8524349.176470589, + "logps/chosen": -203.443017578125, + "logps/rejected": -309.54041245404414, + "loss": 0.2556, + "rewards/chosen": 0.23375027974446613, + "rewards/margins": 3.705425703759287, + "rewards/rejected": -3.471675424014821, + "step": 164 + }, + { + "epoch": 0.2855906533967979, + "grad_norm": 15.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11055366.857142856, + "logits/rejected": -1121562.3333333333, + "logps/chosen": -159.264892578125, + "logps/rejected": -286.91015625, + "loss": 0.2449, + "rewards/chosen": 0.12227598258427211, + "rewards/margins": 4.091070063530452, + "rewards/rejected": -3.9687940809461804, + "step": 165 + }, + { + "epoch": 0.287321505841627, + "grad_norm": 23.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 657442.6, + "logits/rejected": -9425391.333333334, + "logps/chosen": -220.0943603515625, + "logps/rejected": -196.21675618489584, + "loss": 0.3849, + "rewards/chosen": -0.32888593673706057, + "rewards/margins": 2.8519148190816246, + "rewards/rejected": -3.180800755818685, + "step": 166 + }, + { + "epoch": 0.2890523582864561, + "grad_norm": 22.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -4699375.555555556, + "logits/rejected": -3600353.1428571427, + "logps/chosen": -265.4497341579861, + "logps/rejected": -267.37051827566967, + "loss": 0.2924, + "rewards/chosen": 0.23302984237670898, + "rewards/margins": 3.618199280330113, + "rewards/rejected": -3.385169437953404, + "step": 167 + }, + { + "epoch": 0.29078321073128516, + "grad_norm": 24.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15453594.666666666, + "logits/rejected": -7304598.5, + "logps/chosen": -211.82305908203125, + "logps/rejected": -305.252685546875, + "loss": 0.4196, + "rewards/chosen": -0.15757346153259277, + "rewards/margins": 3.7481679916381836, + "rewards/rejected": -3.9057414531707764, + "step": 168 + }, + { + "epoch": 0.2925140631761142, + "grad_norm": 19.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 917019.2941176471, + "logits/rejected": -11583381.333333334, + "logps/chosen": -187.60915958180146, + "logps/rejected": -256.85833333333335, + "loss": 0.2921, + "rewards/chosen": -0.004517814692328958, + "rewards/margins": 3.6537013320361864, + "rewards/rejected": -3.6582191467285154, + "step": 169 + }, + { + "epoch": 0.2942449156209433, + "grad_norm": 20.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -5889556.705882353, + "logits/rejected": -12821309.866666667, + "logps/chosen": -256.27088120404414, + "logps/rejected": -308.8744791666667, + "loss": 0.2513, + "rewards/chosen": 0.38170385360717773, + "rewards/margins": 4.123530483245849, + "rewards/rejected": -3.741826629638672, + "step": 170 + }, + { + "epoch": 0.2959757680657724, + "grad_norm": 17.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 16747204.0, + "logits/rejected": 620209.6, + "logps/chosen": -243.82991536458334, + "logps/rejected": -253.942626953125, + "loss": 0.2081, + "rewards/chosen": 0.4542102813720703, + "rewards/margins": 3.8508056640625, + "rewards/rejected": -3.3965953826904296, + "step": 171 + }, + { + "epoch": 0.2977066205106015, + "grad_norm": 17.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12614680.615384616, + "logits/rejected": -7096439.578947368, + "logps/chosen": -130.17988469050482, + "logps/rejected": -229.95723684210526, + "loss": 0.2725, + "rewards/chosen": 0.0035039232327387882, + "rewards/margins": 2.607151255675173, + "rewards/rejected": -2.603647332442434, + "step": 172 + }, + { + "epoch": 0.29943747295543055, + "grad_norm": 16.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3154612.0, + "logits/rejected": -9195659.2, + "logps/chosen": -173.6625773111979, + "logps/rejected": -270.675537109375, + "loss": 0.2182, + "rewards/chosen": 0.14845428864161173, + "rewards/margins": 3.5584659616152443, + "rewards/rejected": -3.4100116729736327, + "step": 173 + }, + { + "epoch": 0.3011683254002596, + "grad_norm": 19.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9196400.0, + "logits/rejected": -15175718.0, + "logps/chosen": -157.8928680419922, + "logps/rejected": -259.74560546875, + "loss": 0.3143, + "rewards/chosen": -0.17662523686885834, + "rewards/margins": 3.229040876030922, + "rewards/rejected": -3.4056661128997803, + "step": 174 + }, + { + "epoch": 0.3028991778450887, + "grad_norm": 18.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15626394.352941176, + "logits/rejected": -7392939.2, + "logps/chosen": -241.25695082720588, + "logps/rejected": -236.45672200520832, + "loss": 0.2589, + "rewards/chosen": 0.28153758890488567, + "rewards/margins": 4.230858064165302, + "rewards/rejected": -3.9493204752604165, + "step": 175 + }, + { + "epoch": 0.3046300302899178, + "grad_norm": 22.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9918681.6, + "logits/rejected": -10010532.0, + "logps/chosen": -202.339111328125, + "logps/rejected": -184.87152099609375, + "loss": 0.3503, + "rewards/chosen": -0.07323684692382812, + "rewards/margins": 3.2869134902954102, + "rewards/rejected": -3.3601503372192383, + "step": 176 + }, + { + "epoch": 0.3063608827347469, + "grad_norm": 17.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9519585.0, + "logits/rejected": -17195120.0, + "logps/chosen": -151.671875, + "logps/rejected": -259.06878662109375, + "loss": 0.2859, + "rewards/chosen": 0.010412598960101604, + "rewards/margins": 3.337350751273334, + "rewards/rejected": -3.3269381523132324, + "step": 177 + }, + { + "epoch": 0.30809173517957594, + "grad_norm": 21.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16214270.11764706, + "logits/rejected": -9008196.266666668, + "logps/chosen": -273.4817899816176, + "logps/rejected": -351.15856119791664, + "loss": 0.2819, + "rewards/chosen": 0.03458939930971931, + "rewards/margins": 3.560055555315579, + "rewards/rejected": -3.5254661560058596, + "step": 178 + }, + { + "epoch": 0.309822587624405, + "grad_norm": 18.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -4070659.5555555555, + "logits/rejected": 564305.2857142857, + "logps/chosen": -159.47158474392361, + "logps/rejected": -243.50922502790178, + "loss": 0.3114, + "rewards/chosen": 0.13283884525299072, + "rewards/margins": 3.35727219922202, + "rewards/rejected": -3.224433353969029, + "step": 179 + }, + { + "epoch": 0.3115534400692341, + "grad_norm": 18.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -6579336.888888889, + "logits/rejected": -11798860.57142857, + "logps/chosen": -194.06880696614584, + "logps/rejected": -265.37472098214283, + "loss": 0.3398, + "rewards/chosen": -0.14026531908247206, + "rewards/margins": 2.981163579320151, + "rewards/rejected": -3.121428898402623, + "step": 180 + }, + { + "epoch": 0.3132842925140632, + "grad_norm": 17.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -5290882.0, + "logits/rejected": -21157156.0, + "logps/chosen": -148.17227172851562, + "logps/rejected": -296.1889953613281, + "loss": 0.3044, + "rewards/chosen": -0.1293964684009552, + "rewards/margins": 3.0376605689525604, + "rewards/rejected": -3.1670570373535156, + "step": 181 + }, + { + "epoch": 0.31501514495889227, + "grad_norm": 21.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9015065.263157895, + "logits/rejected": -6190756.923076923, + "logps/chosen": -208.37291837993422, + "logps/rejected": -260.5161884014423, + "loss": 0.3085, + "rewards/chosen": 0.10386697869551809, + "rewards/margins": 3.3943309397832584, + "rewards/rejected": -3.2904639610877404, + "step": 182 + }, + { + "epoch": 0.31674599740372134, + "grad_norm": 19.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10990801.066666666, + "logits/rejected": -6380602.352941177, + "logps/chosen": -166.50836588541668, + "logps/rejected": -261.99778837316177, + "loss": 0.2828, + "rewards/chosen": -0.09178520043690999, + "rewards/margins": 3.105228430149602, + "rewards/rejected": -3.197013630586512, + "step": 183 + }, + { + "epoch": 0.3184768498485504, + "grad_norm": 22.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15325334.857142856, + "logits/rejected": -24043242.181818184, + "logps/chosen": -213.09461030505952, + "logps/rejected": -345.95725319602275, + "loss": 0.3397, + "rewards/chosen": 0.22786199478876024, + "rewards/margins": 2.5508950675204716, + "rewards/rejected": -2.3230330727317114, + "step": 184 + }, + { + "epoch": 0.32020770229337947, + "grad_norm": 21.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11346512.94117647, + "logits/rejected": -2746510.4, + "logps/chosen": -264.31959443933823, + "logps/rejected": -207.83736979166667, + "loss": 0.2974, + "rewards/chosen": 0.12100423083585851, + "rewards/margins": 2.763645679810468, + "rewards/rejected": -2.6426414489746093, + "step": 185 + }, + { + "epoch": 0.3219385547382086, + "grad_norm": 19.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10335120.0, + "logits/rejected": -11680630.0, + "logps/chosen": -169.1227264404297, + "logps/rejected": -242.72560119628906, + "loss": 0.3009, + "rewards/chosen": 0.12381087243556976, + "rewards/margins": 3.1314540952444077, + "rewards/rejected": -3.007643222808838, + "step": 186 + }, + { + "epoch": 0.32366940718303766, + "grad_norm": 18.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15741749.714285715, + "logits/rejected": -5845185.333333333, + "logps/chosen": -198.52054268973214, + "logps/rejected": -295.60541449652777, + "loss": 0.2393, + "rewards/chosen": 0.4524484021323068, + "rewards/margins": 3.021870806103661, + "rewards/rejected": -2.569422403971354, + "step": 187 + }, + { + "epoch": 0.32540025962786673, + "grad_norm": 19.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9898615.578947369, + "logits/rejected": -2790789.846153846, + "logps/chosen": -154.5710320723684, + "logps/rejected": -181.50324894831732, + "loss": 0.3752, + "rewards/chosen": -0.13492245423166374, + "rewards/margins": 2.425167830849466, + "rewards/rejected": -2.56009028508113, + "step": 188 + }, + { + "epoch": 0.3271311120726958, + "grad_norm": 19.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12387616.842105264, + "logits/rejected": -8763428.923076924, + "logps/chosen": -166.91708213404604, + "logps/rejected": -238.5693359375, + "loss": 0.3205, + "rewards/chosen": 0.12465482009084601, + "rewards/margins": 2.8608485503717955, + "rewards/rejected": -2.7361937302809496, + "step": 189 + }, + { + "epoch": 0.32886196451752486, + "grad_norm": 20.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11473281.454545455, + "logits/rejected": -16561514.666666666, + "logps/chosen": -248.29432262073863, + "logps/rejected": -292.7100074404762, + "loss": 0.2314, + "rewards/chosen": 0.17398832061073996, + "rewards/margins": 3.0009191387143486, + "rewards/rejected": -2.826930818103609, + "step": 190 + }, + { + "epoch": 0.330592816962354, + "grad_norm": 16.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8514306.823529411, + "logits/rejected": -15071434.666666666, + "logps/chosen": -131.8534725413603, + "logps/rejected": -270.87555338541665, + "loss": 0.3024, + "rewards/chosen": 0.18799910825841568, + "rewards/margins": 3.0057781406477386, + "rewards/rejected": -2.817779032389323, + "step": 191 + }, + { + "epoch": 0.33232366940718305, + "grad_norm": 19.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -1210433.894736842, + "logits/rejected": -8759064.0, + "logps/chosen": -148.5827765213816, + "logps/rejected": -183.75860126201923, + "loss": 0.3729, + "rewards/chosen": -0.010686732436481276, + "rewards/margins": 2.6109067429053154, + "rewards/rejected": -2.621593475341797, + "step": 192 + }, + { + "epoch": 0.3340545218520121, + "grad_norm": 18.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 1876100.6153846155, + "logits/rejected": -12178267.789473685, + "logps/chosen": -158.33451021634616, + "logps/rejected": -272.60916940789474, + "loss": 0.2534, + "rewards/chosen": 0.049141957209660456, + "rewards/margins": 3.0719871752657872, + "rewards/rejected": -3.0228452180561267, + "step": 193 + }, + { + "epoch": 0.3357853742968412, + "grad_norm": 18.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9391846.0, + "logits/rejected": -7076478.5, + "logps/chosen": -197.68260192871094, + "logps/rejected": -201.0494384765625, + "loss": 0.2827, + "rewards/chosen": 0.280254989862442, + "rewards/margins": 3.4145003855228424, + "rewards/rejected": -3.1342453956604004, + "step": 194 + }, + { + "epoch": 0.33751622674167026, + "grad_norm": 18.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19537841.066666666, + "logits/rejected": -4787949.176470588, + "logps/chosen": -203.43776041666666, + "logps/rejected": -216.7041015625, + "loss": 0.3368, + "rewards/chosen": 0.03912758032480876, + "rewards/margins": 2.348604769332736, + "rewards/rejected": -2.3094771890079273, + "step": 195 + }, + { + "epoch": 0.3392470791864994, + "grad_norm": 16.625, + "kl": 0.16342926025390625, + "learning_rate": 5e-06, + "logits/chosen": -3170068.8421052634, + "logits/rejected": -18859544.615384616, + "logps/chosen": -124.50954718338816, + "logps/rejected": -328.8591120793269, + "loss": 0.2909, + "rewards/chosen": 0.2587398478859349, + "rewards/margins": 3.9374563607127078, + "rewards/rejected": -3.6787165128267727, + "step": 196 + }, + { + "epoch": 0.34097793163132845, + "grad_norm": 21.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 454123.25, + "logits/rejected": -4452693.0, + "logps/chosen": -233.21722412109375, + "logps/rejected": -264.53143310546875, + "loss": 0.2989, + "rewards/chosen": -0.011935576796531677, + "rewards/margins": 3.860817089676857, + "rewards/rejected": -3.8727526664733887, + "step": 197 + }, + { + "epoch": 0.3427087840761575, + "grad_norm": 16.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3420107.466666667, + "logits/rejected": -14321411.764705881, + "logps/chosen": -135.79720052083334, + "logps/rejected": -363.07223690257354, + "loss": 0.2492, + "rewards/chosen": 0.19408594767252604, + "rewards/margins": 3.735214442832797, + "rewards/rejected": -3.541128495160271, + "step": 198 + }, + { + "epoch": 0.3444396365209866, + "grad_norm": 21.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -5829539.6, + "logits/rejected": -15839922.666666666, + "logps/chosen": -191.86925048828124, + "logps/rejected": -276.8868408203125, + "loss": 0.3512, + "rewards/chosen": 0.14528814554214478, + "rewards/margins": 2.9095884919166566, + "rewards/rejected": -2.7643003463745117, + "step": 199 + }, + { + "epoch": 0.34617048896581565, + "grad_norm": 17.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3732642.0, + "logits/rejected": -6055160.0, + "logps/chosen": -145.93505859375, + "logps/rejected": -213.4742889404297, + "loss": 0.2826, + "rewards/chosen": 0.03191981464624405, + "rewards/margins": 3.5864234939217567, + "rewards/rejected": -3.5545036792755127, + "step": 200 + }, + { + "epoch": 0.3479013414106447, + "grad_norm": 17.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10997382.666666666, + "logits/rejected": -13361217.6, + "logps/chosen": -214.18802897135416, + "logps/rejected": -290.353857421875, + "loss": 0.2108, + "rewards/chosen": 0.08698128660519917, + "rewards/margins": 3.8983208556969964, + "rewards/rejected": -3.811339569091797, + "step": 201 + }, + { + "epoch": 0.34963219385547384, + "grad_norm": 15.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16647024.0, + "logits/rejected": -16219051.789473685, + "logps/chosen": -266.0360576923077, + "logps/rejected": -278.2851305509868, + "loss": 0.2141, + "rewards/chosen": 0.3400090290949895, + "rewards/margins": 4.1297148075180985, + "rewards/rejected": -3.7897057784231087, + "step": 202 + }, + { + "epoch": 0.3513630463003029, + "grad_norm": 20.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14588352.0, + "logits/rejected": -4774448.842105263, + "logps/chosen": -257.1218825120192, + "logps/rejected": -202.72636975740133, + "loss": 0.2567, + "rewards/chosen": -0.031910451558920055, + "rewards/margins": 3.413597200322248, + "rewards/rejected": -3.445507651881168, + "step": 203 + }, + { + "epoch": 0.353093898745132, + "grad_norm": 21.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3514067.5, + "logits/rejected": -21243462.0, + "logps/chosen": -222.9899139404297, + "logps/rejected": -302.18115234375, + "loss": 0.2852, + "rewards/chosen": -0.013363361358642578, + "rewards/margins": 3.822953462600708, + "rewards/rejected": -3.8363168239593506, + "step": 204 + }, + { + "epoch": 0.35482475118996104, + "grad_norm": 23.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 10442356.444444444, + "logits/rejected": -5046877.142857143, + "logps/chosen": -239.98092990451389, + "logps/rejected": -214.3353271484375, + "loss": 0.3217, + "rewards/chosen": 0.006140223807758755, + "rewards/margins": 3.9656730977788803, + "rewards/rejected": -3.9595328739711215, + "step": 205 + }, + { + "epoch": 0.3565556036347901, + "grad_norm": 17.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3316098.5714285714, + "logits/rejected": -3345895.5555555555, + "logps/chosen": -141.17440359933036, + "logps/rejected": -268.8661838107639, + "loss": 0.2669, + "rewards/chosen": -0.06388027327401298, + "rewards/margins": 3.7650589526645724, + "rewards/rejected": -3.8289392259385853, + "step": 206 + }, + { + "epoch": 0.35828645607961923, + "grad_norm": 24.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10579598.476190476, + "logits/rejected": -7490005.818181818, + "logps/chosen": -215.40415736607142, + "logps/rejected": -398.5939275568182, + "loss": 0.3157, + "rewards/chosen": 0.11144000007992699, + "rewards/margins": 5.1400641011985355, + "rewards/rejected": -5.028624101118608, + "step": 207 + }, + { + "epoch": 0.3600173085244483, + "grad_norm": 18.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -2552588.470588235, + "logits/rejected": 11076235.733333332, + "logps/chosen": -175.71145450367646, + "logps/rejected": -340.4918619791667, + "loss": 0.293, + "rewards/chosen": 0.09705781235414393, + "rewards/margins": 3.936987011572894, + "rewards/rejected": -3.83992919921875, + "step": 208 + }, + { + "epoch": 0.36174816096927737, + "grad_norm": 17.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8181181.0, + "logits/rejected": -4103885.5, + "logps/chosen": -175.54034423828125, + "logps/rejected": -195.105712890625, + "loss": 0.3223, + "rewards/chosen": 0.006002817302942276, + "rewards/margins": 2.481421146541834, + "rewards/rejected": -2.4754183292388916, + "step": 209 + }, + { + "epoch": 0.36347901341410643, + "grad_norm": 21.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -6023152.0, + "logits/rejected": -6930216.615384615, + "logps/chosen": -220.12386924342104, + "logps/rejected": -259.09130859375, + "loss": 0.3206, + "rewards/chosen": 0.02514595577591344, + "rewards/margins": 3.885440004378678, + "rewards/rejected": -3.8602940486027646, + "step": 210 + }, + { + "epoch": 0.3652098658589355, + "grad_norm": 17.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8873345.846153846, + "logits/rejected": -4841296.421052632, + "logps/chosen": -222.28448016826923, + "logps/rejected": -257.08958675986844, + "loss": 0.2588, + "rewards/chosen": -0.17216739287743202, + "rewards/margins": 3.402585816286836, + "rewards/rejected": -3.574753209164268, + "step": 211 + }, + { + "epoch": 0.3669407183037646, + "grad_norm": 18.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 1087914.2666666666, + "logits/rejected": -13504388.705882354, + "logps/chosen": -207.50533854166667, + "logps/rejected": -327.5465303308824, + "loss": 0.2455, + "rewards/chosen": 0.09607280890146891, + "rewards/margins": 4.1472624559028475, + "rewards/rejected": -4.051189647001379, + "step": 212 + }, + { + "epoch": 0.3686715707485937, + "grad_norm": 18.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12526222.933333334, + "logits/rejected": -2303273.882352941, + "logps/chosen": -217.412255859375, + "logps/rejected": -159.63197954963235, + "loss": 0.2619, + "rewards/chosen": 0.2408916155497233, + "rewards/margins": 3.14007298151652, + "rewards/rejected": -2.899181365966797, + "step": 213 + }, + { + "epoch": 0.37040242319342276, + "grad_norm": 21.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -6768866.105263158, + "logits/rejected": -11241085.538461538, + "logps/chosen": -209.8044305098684, + "logps/rejected": -294.76034780649036, + "loss": 0.3237, + "rewards/chosen": 0.22350662632992394, + "rewards/margins": 2.894371611869287, + "rewards/rejected": -2.670864985539363, + "step": 214 + }, + { + "epoch": 0.3721332756382518, + "grad_norm": 20.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15369493.333333334, + "logits/rejected": 2045517.2857142857, + "logps/chosen": -224.15772840711804, + "logps/rejected": -162.84868512834822, + "loss": 0.2959, + "rewards/chosen": 0.2034378316667345, + "rewards/margins": 3.11308999667092, + "rewards/rejected": -2.9096521650041853, + "step": 215 + }, + { + "epoch": 0.3738641280830809, + "grad_norm": 20.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20242884.0, + "logits/rejected": -7449432.0, + "logps/chosen": -279.4679260253906, + "logps/rejected": -204.05104064941406, + "loss": 0.2813, + "rewards/chosen": 0.30105486512184143, + "rewards/margins": 3.196340948343277, + "rewards/rejected": -2.8952860832214355, + "step": 216 + }, + { + "epoch": 0.37559498052791, + "grad_norm": 17.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -5433502.909090909, + "logits/rejected": 500345.14285714284, + "logps/chosen": -189.3955078125, + "logps/rejected": -303.5468982514881, + "loss": 0.2097, + "rewards/chosen": 0.4285439144481312, + "rewards/margins": 3.622101653705944, + "rewards/rejected": -3.1935577392578125, + "step": 217 + }, + { + "epoch": 0.3773258329727391, + "grad_norm": 18.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8591011.333333334, + "logits/rejected": -6802850.4, + "logps/chosen": -284.0384928385417, + "logps/rejected": -196.76927490234374, + "loss": 0.2485, + "rewards/chosen": 0.3856252034505208, + "rewards/margins": 3.309361775716146, + "rewards/rejected": -2.923736572265625, + "step": 218 + }, + { + "epoch": 0.37905668541756815, + "grad_norm": 14.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 4748231.2, + "logits/rejected": -13977383.272727273, + "logps/chosen": -120.3367431640625, + "logps/rejected": -259.1859685724432, + "loss": 0.2385, + "rewards/chosen": -0.07536518573760986, + "rewards/margins": 3.3064329515803945, + "rewards/rejected": -3.3817981373180044, + "step": 219 + }, + { + "epoch": 0.3807875378623972, + "grad_norm": 18.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -2363141.230769231, + "logits/rejected": -15810723.368421054, + "logps/chosen": -170.66443810096155, + "logps/rejected": -376.7901675575658, + "loss": 0.2529, + "rewards/chosen": 0.12850810931279108, + "rewards/margins": 3.2996522936261132, + "rewards/rejected": -3.1711441843133223, + "step": 220 + }, + { + "epoch": 0.3825183903072263, + "grad_norm": 19.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11173596.8, + "logits/rejected": -14623304.470588235, + "logps/chosen": -228.55849609375, + "logps/rejected": -248.14662798713235, + "loss": 0.2426, + "rewards/chosen": 0.3422792116800944, + "rewards/margins": 3.8824749011619417, + "rewards/rejected": -3.5401956894818474, + "step": 221 + }, + { + "epoch": 0.3842492427520554, + "grad_norm": 17.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20544094.769230768, + "logits/rejected": -15617290.105263159, + "logps/chosen": -238.24759615384616, + "logps/rejected": -270.76454564144734, + "loss": 0.2101, + "rewards/chosen": 0.37209848257211536, + "rewards/margins": 4.252398904035931, + "rewards/rejected": -3.880300421463816, + "step": 222 + }, + { + "epoch": 0.3859800951968845, + "grad_norm": 18.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 2339170.5, + "logits/rejected": -14668691.0, + "logps/chosen": -168.8272247314453, + "logps/rejected": -316.6446228027344, + "loss": 0.2904, + "rewards/chosen": 0.05972611904144287, + "rewards/margins": 3.1063586473464966, + "rewards/rejected": -3.0466325283050537, + "step": 223 + }, + { + "epoch": 0.38771094764171354, + "grad_norm": 21.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10121347.42857143, + "logits/rejected": -9601993.777777778, + "logps/chosen": -243.92325265066964, + "logps/rejected": -305.10259331597223, + "loss": 0.2482, + "rewards/chosen": 0.2614833116531372, + "rewards/margins": 3.4309277137120566, + "rewards/rejected": -3.1694444020589194, + "step": 224 + }, + { + "epoch": 0.3894418000865426, + "grad_norm": 18.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 8372479.157894737, + "logits/rejected": -18436361.846153848, + "logps/chosen": -128.38247841282896, + "logps/rejected": -282.94106820913464, + "loss": 0.353, + "rewards/chosen": -0.023535085351843583, + "rewards/margins": 3.7766199674200913, + "rewards/rejected": -3.800155052771935, + "step": 225 + }, + { + "epoch": 0.3911726525313717, + "grad_norm": 21.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -1354891.6842105263, + "logits/rejected": -14058054.153846154, + "logps/chosen": -161.87158203125, + "logps/rejected": -339.60730919471155, + "loss": 0.304, + "rewards/chosen": 0.12897560470982602, + "rewards/margins": 4.465609140241677, + "rewards/rejected": -4.336633535531851, + "step": 226 + }, + { + "epoch": 0.3929035049762008, + "grad_norm": 21.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10190957.176470589, + "logits/rejected": -10005033.6, + "logps/chosen": -191.16943359375, + "logps/rejected": -214.92547200520832, + "loss": 0.3543, + "rewards/chosen": -0.27459220325245576, + "rewards/margins": 2.7234729822944193, + "rewards/rejected": -2.998065185546875, + "step": 227 + }, + { + "epoch": 0.39463435742102987, + "grad_norm": 16.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12754082.133333333, + "logits/rejected": 9428894.11764706, + "logps/chosen": -160.12646484375, + "logps/rejected": -193.08636833639707, + "loss": 0.2904, + "rewards/chosen": 0.34527934392293297, + "rewards/margins": 2.9829243921766095, + "rewards/rejected": -2.6376450482536766, + "step": 228 + }, + { + "epoch": 0.39636520986585894, + "grad_norm": 19.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11040588.631578946, + "logits/rejected": -28768635.076923076, + "logps/chosen": -202.25439453125, + "logps/rejected": -298.80213341346155, + "loss": 0.2818, + "rewards/chosen": 0.2841626719424599, + "rewards/margins": 4.040860400026144, + "rewards/rejected": -3.756697728083684, + "step": 229 + }, + { + "epoch": 0.398096062310688, + "grad_norm": 18.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16696243.2, + "logits/rejected": 782789.7058823529, + "logps/chosen": -209.39425455729167, + "logps/rejected": -243.5696518841912, + "loss": 0.2405, + "rewards/chosen": 0.21385353406270344, + "rewards/margins": 3.5233376792832916, + "rewards/rejected": -3.3094841452205883, + "step": 230 + }, + { + "epoch": 0.39982691475551707, + "grad_norm": 17.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -6847610.857142857, + "logits/rejected": -341155.47222222225, + "logps/chosen": -145.1298566545759, + "logps/rejected": -159.03776041666666, + "loss": 0.3048, + "rewards/chosen": 0.034968899829047065, + "rewards/margins": 2.7618577683728835, + "rewards/rejected": -2.7268888685438366, + "step": 231 + }, + { + "epoch": 0.4015577672003462, + "grad_norm": 19.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14881117.0, + "logits/rejected": -15492494.0, + "logps/chosen": -207.72711181640625, + "logps/rejected": -320.58319091796875, + "loss": 0.2664, + "rewards/chosen": 0.16509190201759338, + "rewards/margins": 4.239029794931412, + "rewards/rejected": -4.073937892913818, + "step": 232 + }, + { + "epoch": 0.40328861964517526, + "grad_norm": 20.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -5929799.529411765, + "logits/rejected": -12552819.2, + "logps/chosen": -215.2191664751838, + "logps/rejected": -290.16845703125, + "loss": 0.2607, + "rewards/chosen": 0.19065551196827607, + "rewards/margins": 4.717087295008641, + "rewards/rejected": -4.526431783040365, + "step": 233 + }, + { + "epoch": 0.40501947209000433, + "grad_norm": 17.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 8185308.8, + "logits/rejected": -14831882.352941176, + "logps/chosen": -165.01144205729167, + "logps/rejected": -273.51809512867646, + "loss": 0.2833, + "rewards/chosen": -0.0994392474492391, + "rewards/margins": 3.8827245745004393, + "rewards/rejected": -3.9821638219496784, + "step": 234 + }, + { + "epoch": 0.4067503245348334, + "grad_norm": 21.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12612798.315789474, + "logits/rejected": -19476348.307692308, + "logps/chosen": -231.91385690789474, + "logps/rejected": -316.7388446514423, + "loss": 0.3007, + "rewards/chosen": 0.1857273955094187, + "rewards/margins": 5.110347803787664, + "rewards/rejected": -4.924620408278245, + "step": 235 + }, + { + "epoch": 0.40848117697966246, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13798296.0, + "logits/rejected": -13150624.0, + "logps/chosen": -184.28253173828125, + "logps/rejected": -293.38729580965907, + "loss": 0.2181, + "rewards/chosen": -0.08902863264083863, + "rewards/margins": 3.767916405200958, + "rewards/rejected": -3.856945037841797, + "step": 236 + }, + { + "epoch": 0.4102120294244916, + "grad_norm": 19.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -5185518.933333334, + "logits/rejected": -10506853.647058824, + "logps/chosen": -195.71438802083333, + "logps/rejected": -266.6700080422794, + "loss": 0.2492, + "rewards/chosen": 0.16841630935668944, + "rewards/margins": 4.66889471727259, + "rewards/rejected": -4.5004784079159, + "step": 237 + }, + { + "epoch": 0.41194288186932065, + "grad_norm": 17.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -120490.13333333333, + "logits/rejected": -4157334.588235294, + "logps/chosen": -165.81774088541667, + "logps/rejected": -338.9480985753676, + "loss": 0.2444, + "rewards/chosen": 0.16940480868021648, + "rewards/margins": 3.9526734567156026, + "rewards/rejected": -3.783268648035386, + "step": 238 + }, + { + "epoch": 0.4136737343141497, + "grad_norm": 21.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13063847.466666667, + "logits/rejected": -7603120.0, + "logps/chosen": -217.052294921875, + "logps/rejected": -281.8909696691176, + "loss": 0.2602, + "rewards/chosen": 0.0732549508412679, + "rewards/margins": 3.9795507272084554, + "rewards/rejected": -3.9062957763671875, + "step": 239 + }, + { + "epoch": 0.4154045867589788, + "grad_norm": 19.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13696624.0, + "logits/rejected": 21262386.82352941, + "logps/chosen": -188.62125651041666, + "logps/rejected": -326.68488625919116, + "loss": 0.2636, + "rewards/chosen": 0.14170858065287273, + "rewards/margins": 3.8223358425439575, + "rewards/rejected": -3.6806272618910847, + "step": 240 + }, + { + "epoch": 0.41713543920380786, + "grad_norm": 15.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11527736.615384616, + "logits/rejected": -13245899.789473685, + "logps/chosen": -171.26971905048077, + "logps/rejected": -316.2771638569079, + "loss": 0.2359, + "rewards/chosen": 0.039726394873399004, + "rewards/margins": 4.057520953749838, + "rewards/rejected": -4.017794558876439, + "step": 241 + }, + { + "epoch": 0.418866291648637, + "grad_norm": 20.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10542560.0, + "logits/rejected": -15128954.0, + "logps/chosen": -197.97857666015625, + "logps/rejected": -249.15786743164062, + "loss": 0.2893, + "rewards/chosen": -0.0119645856320858, + "rewards/margins": 3.6342065073549747, + "rewards/rejected": -3.6461710929870605, + "step": 242 + }, + { + "epoch": 0.42059714409346605, + "grad_norm": 22.5, + "kl": 0.13220763206481934, + "learning_rate": 5e-06, + "logits/chosen": -17448896.0, + "logits/rejected": -10536552.533333333, + "logps/chosen": -242.55230353860293, + "logps/rejected": -180.8974609375, + "loss": 0.3269, + "rewards/chosen": 0.07533069217906278, + "rewards/margins": 3.0032341854245055, + "rewards/rejected": -2.927903493245443, + "step": 243 + }, + { + "epoch": 0.4223279965382951, + "grad_norm": 16.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15767290.181818182, + "logits/rejected": -11785589.333333334, + "logps/chosen": -199.07419655539772, + "logps/rejected": -267.2129371279762, + "loss": 0.1908, + "rewards/chosen": 0.3069478381763805, + "rewards/margins": 3.4843519693845275, + "rewards/rejected": -3.177404131208147, + "step": 244 + }, + { + "epoch": 0.4240588489831242, + "grad_norm": 16.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9023009.777777778, + "logits/rejected": -8307068.0, + "logps/chosen": -138.5511474609375, + "logps/rejected": -191.49358258928572, + "loss": 0.2808, + "rewards/chosen": 0.27963558832804364, + "rewards/margins": 3.714873745327904, + "rewards/rejected": -3.4352381569998607, + "step": 245 + }, + { + "epoch": 0.42578970142795325, + "grad_norm": 14.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12051193.6, + "logits/rejected": -10616396.235294119, + "logps/chosen": -171.37771809895833, + "logps/rejected": -222.20760569852942, + "loss": 0.2502, + "rewards/chosen": 0.1699681282043457, + "rewards/margins": 4.40653590595021, + "rewards/rejected": -4.236567777745864, + "step": 246 + }, + { + "epoch": 0.42752055387278237, + "grad_norm": 19.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12088685.714285715, + "logits/rejected": -5290852.0, + "logps/chosen": -208.72258649553572, + "logps/rejected": -252.75640190972223, + "loss": 0.2659, + "rewards/chosen": 0.08243453502655029, + "rewards/margins": 3.2210644483566284, + "rewards/rejected": -3.138629913330078, + "step": 247 + }, + { + "epoch": 0.42925140631761144, + "grad_norm": 16.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18058203.2, + "logits/rejected": -10416639.272727273, + "logps/chosen": -224.68359375, + "logps/rejected": -289.28861860795456, + "loss": 0.2142, + "rewards/chosen": -0.14810900688171386, + "rewards/margins": 3.584905880147761, + "rewards/rejected": -3.7330148870294746, + "step": 248 + }, + { + "epoch": 0.4309822587624405, + "grad_norm": 18.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15116648.533333333, + "logits/rejected": -3318948.0, + "logps/chosen": -177.97216796875, + "logps/rejected": -264.22144990808823, + "loss": 0.2523, + "rewards/chosen": 0.038343381881713864, + "rewards/margins": 4.054038841584149, + "rewards/rejected": -4.015695459702435, + "step": 249 + }, + { + "epoch": 0.4327131112072696, + "grad_norm": 22.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11157197.47368421, + "logits/rejected": 2493580.3076923075, + "logps/chosen": -177.07334498355263, + "logps/rejected": -290.3555438701923, + "loss": 0.3557, + "rewards/chosen": -0.19027650983710037, + "rewards/margins": 3.0934826806489273, + "rewards/rejected": -3.2837591904860277, + "step": 250 + }, + { + "epoch": 0.43444396365209864, + "grad_norm": 20.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16668712.533333333, + "logits/rejected": -3976555.294117647, + "logps/chosen": -244.20847981770834, + "logps/rejected": -327.9715935202206, + "loss": 0.2648, + "rewards/chosen": 0.03703808784484863, + "rewards/margins": 4.71754776730257, + "rewards/rejected": -4.680509679457721, + "step": 251 + }, + { + "epoch": 0.43617481609692776, + "grad_norm": 17.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 5272862.545454546, + "logits/rejected": -6936409.904761905, + "logps/chosen": -229.92167524857953, + "logps/rejected": -213.8749534970238, + "loss": 0.2408, + "rewards/chosen": -0.30214368213306775, + "rewards/margins": 3.3398687777581153, + "rewards/rejected": -3.642012459891183, + "step": 252 + }, + { + "epoch": 0.43790566854175683, + "grad_norm": 19.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3558934.933333333, + "logits/rejected": -2213124.2352941176, + "logps/chosen": -207.47721354166666, + "logps/rejected": -196.65370806525735, + "loss": 0.2565, + "rewards/chosen": 0.18845229148864745, + "rewards/margins": 3.44047677376691, + "rewards/rejected": -3.2520244822782627, + "step": 253 + }, + { + "epoch": 0.4396365209865859, + "grad_norm": 20.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3256860.5714285714, + "logits/rejected": -1937111.3333333333, + "logps/chosen": -222.31358119419642, + "logps/rejected": -198.45515950520834, + "loss": 0.2842, + "rewards/chosen": -0.027769644345555986, + "rewards/margins": 3.096243832556028, + "rewards/rejected": -3.124013476901584, + "step": 254 + }, + { + "epoch": 0.44136737343141497, + "grad_norm": 20.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 2231512.0, + "logits/rejected": -15707685.714285715, + "logps/chosen": -224.28355577256946, + "logps/rejected": -377.65659877232144, + "loss": 0.283, + "rewards/chosen": 0.1279101769129435, + "rewards/margins": 4.255186188788642, + "rewards/rejected": -4.127276011875698, + "step": 255 + }, + { + "epoch": 0.44309822587624403, + "grad_norm": 19.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3650349.0, + "logits/rejected": -25763404.0, + "logps/chosen": -175.8751220703125, + "logps/rejected": -384.6330261230469, + "loss": 0.2993, + "rewards/chosen": -0.2350717931985855, + "rewards/margins": 3.420280560851097, + "rewards/rejected": -3.6553523540496826, + "step": 256 + }, + { + "epoch": 0.4448290783210731, + "grad_norm": 22.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8227142.095238095, + "logits/rejected": -3991596.0, + "logps/chosen": -204.65394810267858, + "logps/rejected": -230.69422496448863, + "loss": 0.3679, + "rewards/chosen": -0.06990920929681688, + "rewards/margins": 4.00661361785162, + "rewards/rejected": -4.0765228271484375, + "step": 257 + }, + { + "epoch": 0.4465599307659022, + "grad_norm": 20.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9966890.105263159, + "logits/rejected": -14217319.384615384, + "logps/chosen": -199.6491827713816, + "logps/rejected": -275.9331242487981, + "loss": 0.3237, + "rewards/chosen": 0.01347437344099346, + "rewards/margins": 4.150257386418007, + "rewards/rejected": -4.136783012977014, + "step": 258 + }, + { + "epoch": 0.4482907832107313, + "grad_norm": 20.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13260032.94117647, + "logits/rejected": -3421154.6666666665, + "logps/chosen": -238.68637982536765, + "logps/rejected": -267.1708658854167, + "loss": 0.2502, + "rewards/chosen": 0.2879646806155934, + "rewards/margins": 4.020605773551791, + "rewards/rejected": -3.732641092936198, + "step": 259 + }, + { + "epoch": 0.45002163565556036, + "grad_norm": 18.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20596262.85714286, + "logits/rejected": -7963298.666666667, + "logps/chosen": -236.05545479910714, + "logps/rejected": -297.91436089409723, + "loss": 0.2213, + "rewards/chosen": 0.6324899537222726, + "rewards/margins": 4.426475108615936, + "rewards/rejected": -3.7939851548936634, + "step": 260 + }, + { + "epoch": 0.4517524881003894, + "grad_norm": 15.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14417637.333333334, + "logits/rejected": -13684048.0, + "logps/chosen": -196.896435546875, + "logps/rejected": -312.3822380514706, + "loss": 0.1969, + "rewards/chosen": 0.5348507563273112, + "rewards/margins": 4.734749868804333, + "rewards/rejected": -4.199899112477022, + "step": 261 + }, + { + "epoch": 0.4534833405452185, + "grad_norm": 19.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24702128.0, + "logits/rejected": -9365918.222222222, + "logps/chosen": -337.36460658482144, + "logps/rejected": -256.26673719618054, + "loss": 0.2306, + "rewards/chosen": 0.24765947886875697, + "rewards/margins": 4.048638669271318, + "rewards/rejected": -3.8009791904025607, + "step": 262 + }, + { + "epoch": 0.4552141929900476, + "grad_norm": 19.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12827164.631578946, + "logits/rejected": -11524402.461538462, + "logps/chosen": -181.87505139802633, + "logps/rejected": -344.0277569110577, + "loss": 0.2989, + "rewards/chosen": 0.03691702453713668, + "rewards/margins": 4.332692511409883, + "rewards/rejected": -4.2957754868727465, + "step": 263 + }, + { + "epoch": 0.4569450454348767, + "grad_norm": 20.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18784899.555555556, + "logits/rejected": -12124728.0, + "logps/chosen": -233.41818576388889, + "logps/rejected": -315.662109375, + "loss": 0.3184, + "rewards/chosen": 0.08528125286102295, + "rewards/margins": 3.363411920411246, + "rewards/rejected": -3.278130667550223, + "step": 264 + }, + { + "epoch": 0.45867589787970575, + "grad_norm": 19.125, + "kl": 0.11611628532409668, + "learning_rate": 5e-06, + "logits/chosen": 1691305.142857143, + "logits/rejected": -6767852.0, + "logps/chosen": -285.6856166294643, + "logps/rejected": -247.17659505208334, + "loss": 0.2455, + "rewards/chosen": 0.24254277774265834, + "rewards/margins": 3.9829009184761657, + "rewards/rejected": -3.740358140733507, + "step": 265 + }, + { + "epoch": 0.4604067503245348, + "grad_norm": 20.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11006114.4, + "logits/rejected": -19388294.666666668, + "logps/chosen": -188.975390625, + "logps/rejected": -325.20001220703125, + "loss": 0.3136, + "rewards/chosen": 0.19371647834777833, + "rewards/margins": 3.5510452111562096, + "rewards/rejected": -3.357328732808431, + "step": 266 + }, + { + "epoch": 0.4621376027693639, + "grad_norm": 16.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17104797.53846154, + "logits/rejected": -6318394.105263158, + "logps/chosen": -190.21593299278845, + "logps/rejected": -221.87461451480263, + "loss": 0.2442, + "rewards/chosen": 0.2514270819150485, + "rewards/margins": 3.471239374716755, + "rewards/rejected": -3.2198122928017066, + "step": 267 + }, + { + "epoch": 0.463868455214193, + "grad_norm": 20.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -4255990.933333334, + "logits/rejected": -16446816.0, + "logps/chosen": -253.98564453125, + "logps/rejected": -270.46570542279414, + "loss": 0.2827, + "rewards/chosen": -0.036499599615732826, + "rewards/margins": 3.59961351109486, + "rewards/rejected": -3.6361131107105926, + "step": 268 + }, + { + "epoch": 0.4655993076590221, + "grad_norm": 17.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11160579.2, + "logits/rejected": -10607611.294117646, + "logps/chosen": -202.769384765625, + "logps/rejected": -240.4682186351103, + "loss": 0.2514, + "rewards/chosen": 0.4125640551249186, + "rewards/margins": 3.6268443518993903, + "rewards/rejected": -3.2142802967744717, + "step": 269 + }, + { + "epoch": 0.46733016010385114, + "grad_norm": 18.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13615528.888888888, + "logits/rejected": -6793550.857142857, + "logps/chosen": -177.54604763454861, + "logps/rejected": -297.06703404017856, + "loss": 0.2885, + "rewards/chosen": 0.25184231334262425, + "rewards/margins": 3.5622371454087514, + "rewards/rejected": -3.310394832066127, + "step": 270 + }, + { + "epoch": 0.4690610125486802, + "grad_norm": 21.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -2008512.4210526317, + "logits/rejected": 1065751.2307692308, + "logps/chosen": -230.33552631578948, + "logps/rejected": -225.39548903245193, + "loss": 0.2912, + "rewards/chosen": 0.33769374144704717, + "rewards/margins": 3.8641099331349977, + "rewards/rejected": -3.5264161916879506, + "step": 271 + }, + { + "epoch": 0.4707918649935093, + "grad_norm": 20.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8079465.5, + "logits/rejected": -4624071.0, + "logps/chosen": -245.87030029296875, + "logps/rejected": -286.6881103515625, + "loss": 0.256, + "rewards/chosen": 0.18378782272338867, + "rewards/margins": 4.057176828384399, + "rewards/rejected": -3.8733890056610107, + "step": 272 + }, + { + "epoch": 0.4725227174383384, + "grad_norm": 17.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 34063.02272727273, + "logits/rejected": 5305572.4, + "logps/chosen": -97.9369229403409, + "logps/rejected": -163.92113037109374, + "loss": 0.362, + "rewards/chosen": 0.07122220234437422, + "rewards/margins": 3.202643482251601, + "rewards/rejected": -3.1314212799072267, + "step": 273 + }, + { + "epoch": 0.47425356988316747, + "grad_norm": 19.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8214460.0, + "logits/rejected": -21400504.0, + "logps/chosen": -159.88619995117188, + "logps/rejected": -304.96246337890625, + "loss": 0.2742, + "rewards/chosen": 0.257793664932251, + "rewards/margins": 3.4434404373168945, + "rewards/rejected": -3.1856467723846436, + "step": 274 + }, + { + "epoch": 0.47598442232799654, + "grad_norm": 19.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9168399.157894736, + "logits/rejected": -14751639.384615384, + "logps/chosen": -186.708251953125, + "logps/rejected": -292.65147986778845, + "loss": 0.2819, + "rewards/chosen": 0.2397254642687346, + "rewards/margins": 4.295501033304191, + "rewards/rejected": -4.055775569035457, + "step": 275 + }, + { + "epoch": 0.4777152747728256, + "grad_norm": 20.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9120151.529411765, + "logits/rejected": -2838455.7333333334, + "logps/chosen": -179.58385512408088, + "logps/rejected": -274.78151041666666, + "loss": 0.2887, + "rewards/chosen": 0.22131121859830968, + "rewards/margins": 3.1119616499134137, + "rewards/rejected": -2.890650431315104, + "step": 276 + }, + { + "epoch": 0.47944612721765467, + "grad_norm": 17.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17451059.42857143, + "logits/rejected": -19632037.333333332, + "logps/chosen": -194.51698521205358, + "logps/rejected": -261.6064453125, + "loss": 0.255, + "rewards/chosen": 0.24481826169150217, + "rewards/margins": 3.8417675211316062, + "rewards/rejected": -3.596949259440104, + "step": 277 + }, + { + "epoch": 0.4811769796624838, + "grad_norm": 18.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11978563.2, + "logits/rejected": -11180960.94117647, + "logps/chosen": -204.08976236979166, + "logps/rejected": -332.37023207720586, + "loss": 0.2457, + "rewards/chosen": 0.2240306536356608, + "rewards/margins": 3.5974735839694154, + "rewards/rejected": -3.3734429303337548, + "step": 278 + }, + { + "epoch": 0.48290783210731286, + "grad_norm": 17.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -4350533.866666666, + "logits/rejected": -8590544.94117647, + "logps/chosen": -151.40662434895833, + "logps/rejected": -249.31729664522058, + "loss": 0.2934, + "rewards/chosen": 0.01796001394589742, + "rewards/margins": 2.6930718967727585, + "rewards/rejected": -2.675111882826861, + "step": 279 + }, + { + "epoch": 0.48463868455214193, + "grad_norm": 19.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10933750.4, + "logits/rejected": -15255677.176470589, + "logps/chosen": -171.53665364583333, + "logps/rejected": -308.31043198529414, + "loss": 0.2806, + "rewards/chosen": 0.030906534194946288, + "rewards/margins": 4.104628924762501, + "rewards/rejected": -4.073722390567555, + "step": 280 + }, + { + "epoch": 0.486369536996971, + "grad_norm": 19.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8110217.142857143, + "logits/rejected": -18321370.181818184, + "logps/chosen": -182.0908435639881, + "logps/rejected": -304.40236594460225, + "loss": 0.3329, + "rewards/chosen": 0.09797722952706474, + "rewards/margins": 3.671043247371525, + "rewards/rejected": -3.5730660178444604, + "step": 281 + }, + { + "epoch": 0.48810038944180006, + "grad_norm": 19.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8963434.0, + "logits/rejected": -6483309.0, + "logps/chosen": -224.63430786132812, + "logps/rejected": -291.1379699707031, + "loss": 0.2706, + "rewards/chosen": 0.08380473405122757, + "rewards/margins": 3.3000806644558907, + "rewards/rejected": -3.216275930404663, + "step": 282 + }, + { + "epoch": 0.4898312418866292, + "grad_norm": 17.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9069934.0, + "logits/rejected": -22540276.0, + "logps/chosen": -141.39923095703125, + "logps/rejected": -317.710693359375, + "loss": 0.3155, + "rewards/chosen": -0.18840433657169342, + "rewards/margins": 3.7216622680425644, + "rewards/rejected": -3.910066604614258, + "step": 283 + }, + { + "epoch": 0.49156209433145825, + "grad_norm": 23.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11534782.315789474, + "logits/rejected": -14227288.615384616, + "logps/chosen": -347.19901315789474, + "logps/rejected": -233.33997521033655, + "loss": 0.2763, + "rewards/chosen": 0.6239749506900185, + "rewards/margins": 3.671866420792182, + "rewards/rejected": -3.0478914701021633, + "step": 284 + }, + { + "epoch": 0.4932929467762873, + "grad_norm": 19.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7307938.52631579, + "logits/rejected": -6305785.846153846, + "logps/chosen": -182.99164782072367, + "logps/rejected": -234.64518855168268, + "loss": 0.3165, + "rewards/chosen": 0.0814883081536544, + "rewards/margins": 3.057790234986587, + "rewards/rejected": -2.9763019268329325, + "step": 285 + }, + { + "epoch": 0.4950237992211164, + "grad_norm": 17.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -2387972.2666666666, + "logits/rejected": -10976164.705882354, + "logps/chosen": -138.42179361979166, + "logps/rejected": -257.4625459558824, + "loss": 0.2974, + "rewards/chosen": -0.04797365665435791, + "rewards/margins": 3.496035608123331, + "rewards/rejected": -3.5440092647776886, + "step": 286 + }, + { + "epoch": 0.49675465166594546, + "grad_norm": 16.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17446714.0, + "logits/rejected": -4344534.5, + "logps/chosen": -249.31387329101562, + "logps/rejected": -275.65106201171875, + "loss": 0.2517, + "rewards/chosen": 0.2581062614917755, + "rewards/margins": 3.798191577196121, + "rewards/rejected": -3.5400853157043457, + "step": 287 + }, + { + "epoch": 0.4984855041107746, + "grad_norm": 18.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -6731353.846153846, + "logits/rejected": -10663467.789473685, + "logps/chosen": -210.49735201322116, + "logps/rejected": -270.64185855263156, + "loss": 0.2199, + "rewards/chosen": 0.23169115873483512, + "rewards/margins": 4.13718504558208, + "rewards/rejected": -3.905493886847245, + "step": 288 + }, + { + "epoch": 0.5002163565556036, + "grad_norm": 20.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12233035.2, + "logits/rejected": -17931900.0, + "logps/chosen": -199.65758056640624, + "logps/rejected": -295.8255208333333, + "loss": 0.3644, + "rewards/chosen": -0.10184909105300903, + "rewards/margins": 2.8798390905062354, + "rewards/rejected": -2.9816881815592446, + "step": 289 + }, + { + "epoch": 0.5019472090004327, + "grad_norm": 17.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16349262.933333334, + "logits/rejected": -4032157.6470588236, + "logps/chosen": -178.17262369791666, + "logps/rejected": -177.91433536305146, + "loss": 0.2751, + "rewards/chosen": 0.25347185134887695, + "rewards/margins": 3.25181806788725, + "rewards/rejected": -2.998346216538373, + "step": 290 + }, + { + "epoch": 0.5036780614452618, + "grad_norm": 15.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11697979.42857143, + "logits/rejected": -6921270.222222222, + "logps/chosen": -158.21102469308036, + "logps/rejected": -194.06583658854166, + "loss": 0.2652, + "rewards/chosen": 0.007261531693594796, + "rewards/margins": 3.3397382535631692, + "rewards/rejected": -3.3324767218695746, + "step": 291 + }, + { + "epoch": 0.5054089138900909, + "grad_norm": 22.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12343669.0, + "logits/rejected": -756113.5, + "logps/chosen": -266.39447021484375, + "logps/rejected": -275.89605712890625, + "loss": 0.2883, + "rewards/chosen": 0.10074785351753235, + "rewards/margins": 3.507503777742386, + "rewards/rejected": -3.4067559242248535, + "step": 292 + }, + { + "epoch": 0.50713976633492, + "grad_norm": 18.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -1096818.7826086956, + "logits/rejected": -1816284.4444444445, + "logps/chosen": -170.7641070822011, + "logps/rejected": -274.8728298611111, + "loss": 0.3391, + "rewards/chosen": 0.17315824135490085, + "rewards/margins": 4.264118849943225, + "rewards/rejected": -4.0909606085883246, + "step": 293 + }, + { + "epoch": 0.508870618779749, + "grad_norm": 18.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 2191678.75, + "logits/rejected": -3811606.5, + "logps/chosen": -256.00042724609375, + "logps/rejected": -234.311767578125, + "loss": 0.2786, + "rewards/chosen": 0.11058405041694641, + "rewards/margins": 3.9605853855609894, + "rewards/rejected": -3.850001335144043, + "step": 294 + }, + { + "epoch": 0.5106014712245781, + "grad_norm": 22.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10988392.38095238, + "logits/rejected": -1123548.2727272727, + "logps/chosen": -236.61251395089286, + "logps/rejected": -139.367919921875, + "loss": 0.3743, + "rewards/chosen": -0.06987906637645903, + "rewards/margins": 2.8193492342383313, + "rewards/rejected": -2.8892283006147905, + "step": 295 + }, + { + "epoch": 0.5123323236694072, + "grad_norm": 19.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14287615.0, + "logits/rejected": -19979746.0, + "logps/chosen": -226.96316528320312, + "logps/rejected": -362.6319885253906, + "loss": 0.23, + "rewards/chosen": 0.542628824710846, + "rewards/margins": 4.330398619174957, + "rewards/rejected": -3.7877697944641113, + "step": 296 + }, + { + "epoch": 0.5140631761142362, + "grad_norm": 16.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 5152360.666666667, + "logits/rejected": -7421411.2, + "logps/chosen": -161.2005818684896, + "logps/rejected": -297.23291015625, + "loss": 0.2261, + "rewards/chosen": 0.024489139517148335, + "rewards/margins": 3.7422232856353124, + "rewards/rejected": -3.717734146118164, + "step": 297 + }, + { + "epoch": 0.5157940285590653, + "grad_norm": 18.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -4712883.0, + "logits/rejected": -5833129.5, + "logps/chosen": -148.4366455078125, + "logps/rejected": -200.3181915283203, + "loss": 0.3333, + "rewards/chosen": -0.13919666409492493, + "rewards/margins": 2.91482612490654, + "rewards/rejected": -3.054022789001465, + "step": 298 + }, + { + "epoch": 0.5175248810038944, + "grad_norm": 21.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8620400.0, + "logits/rejected": 13068243.42857143, + "logps/chosen": -213.53375922309027, + "logps/rejected": -233.19705636160714, + "loss": 0.3398, + "rewards/chosen": -0.27686145570543075, + "rewards/margins": 4.0018741138397695, + "rewards/rejected": -4.2787355695452005, + "step": 299 + }, + { + "epoch": 0.5192557334487234, + "grad_norm": 18.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19145128.888888888, + "logits/rejected": -4031148.0, + "logps/chosen": -201.93617078993054, + "logps/rejected": -236.96861049107142, + "loss": 0.2821, + "rewards/chosen": 0.1381672355863783, + "rewards/margins": 4.387698572779459, + "rewards/rejected": -4.249531337193081, + "step": 300 + }, + { + "epoch": 0.5209865858935526, + "grad_norm": 15.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 4231100.0, + "logits/rejected": -15453786.352941176, + "logps/chosen": -128.3352783203125, + "logps/rejected": -253.23403033088235, + "loss": 0.251, + "rewards/chosen": 0.17960381507873535, + "rewards/margins": 4.087293975493488, + "rewards/rejected": -3.907690160414752, + "step": 301 + }, + { + "epoch": 0.5227174383383817, + "grad_norm": 16.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8237603.5, + "logits/rejected": -18602760.0, + "logps/chosen": -205.55023193359375, + "logps/rejected": -309.1470947265625, + "loss": 0.1584, + "rewards/chosen": 0.48160719871520996, + "rewards/margins": 4.667518059412639, + "rewards/rejected": -4.185910860697429, + "step": 302 + }, + { + "epoch": 0.5244482907832108, + "grad_norm": 15.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 9731814.0, + "logits/rejected": -17929584.0, + "logps/chosen": -123.1455586751302, + "logps/rejected": -274.6130859375, + "loss": 0.2245, + "rewards/chosen": 0.08349011341730754, + "rewards/margins": 3.5951165477434794, + "rewards/rejected": -3.511626434326172, + "step": 303 + }, + { + "epoch": 0.5261791432280398, + "grad_norm": 22.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7043442.0, + "logits/rejected": -2991669.75, + "logps/chosen": -265.1047668457031, + "logps/rejected": -176.61502075195312, + "loss": 0.2763, + "rewards/chosen": 0.10809116065502167, + "rewards/margins": 2.921805664896965, + "rewards/rejected": -2.8137145042419434, + "step": 304 + }, + { + "epoch": 0.5279099956728689, + "grad_norm": 16.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10394807.333333334, + "logits/rejected": -21426294.4, + "logps/chosen": -205.70060221354166, + "logps/rejected": -331.5609619140625, + "loss": 0.239, + "rewards/chosen": -0.04992226759592692, + "rewards/margins": 3.7992056926091515, + "rewards/rejected": -3.8491279602050783, + "step": 305 + }, + { + "epoch": 0.529640848117698, + "grad_norm": 18.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -6148032.94117647, + "logits/rejected": -20105851.733333334, + "logps/chosen": -131.34010225183823, + "logps/rejected": -299.91650390625, + "loss": 0.3114, + "rewards/chosen": -0.14229805329266718, + "rewards/margins": 3.335624513439104, + "rewards/rejected": -3.477922566731771, + "step": 306 + }, + { + "epoch": 0.531371700562527, + "grad_norm": 17.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3961582.6666666665, + "logits/rejected": -7976039.428571428, + "logps/chosen": -179.06184895833334, + "logps/rejected": -226.24178641183036, + "loss": 0.2957, + "rewards/chosen": 0.11587488651275635, + "rewards/margins": 3.8573329618998935, + "rewards/rejected": -3.741458075387137, + "step": 307 + }, + { + "epoch": 0.5331025530073561, + "grad_norm": 20.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -1382150.6, + "logits/rejected": -11513989.333333334, + "logps/chosen": -186.05457763671876, + "logps/rejected": -280.22686767578125, + "loss": 0.3251, + "rewards/chosen": 0.19102494716644286, + "rewards/margins": 3.610140331586202, + "rewards/rejected": -3.4191153844197593, + "step": 308 + }, + { + "epoch": 0.5348334054521852, + "grad_norm": 20.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -63331.36842105263, + "logits/rejected": -6219185.230769231, + "logps/chosen": -175.26646021792763, + "logps/rejected": -227.2190880408654, + "loss": 0.3384, + "rewards/chosen": -0.0007596478650444432, + "rewards/margins": 3.7247318025180687, + "rewards/rejected": -3.725491450383113, + "step": 309 + }, + { + "epoch": 0.5365642578970142, + "grad_norm": 24.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21141552.0, + "logits/rejected": -5755605.714285715, + "logps/chosen": -294.30162217881946, + "logps/rejected": -341.0156947544643, + "loss": 0.2665, + "rewards/chosen": 0.44200200504726833, + "rewards/margins": 4.007129786506532, + "rewards/rejected": -3.5651277814592635, + "step": 310 + }, + { + "epoch": 0.5382951103418434, + "grad_norm": 16.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -4371137.230769231, + "logits/rejected": -5206167.157894737, + "logps/chosen": -263.3343036358173, + "logps/rejected": -250.7532380756579, + "loss": 0.2655, + "rewards/chosen": -0.15584894327016977, + "rewards/margins": 3.370414243535957, + "rewards/rejected": -3.5262631868061267, + "step": 311 + }, + { + "epoch": 0.5400259627866725, + "grad_norm": 15.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -4309514.0, + "logits/rejected": -11352480.0, + "logps/chosen": -135.61445109049478, + "logps/rejected": -298.176904296875, + "loss": 0.2304, + "rewards/chosen": -0.024782342215379078, + "rewards/margins": 4.6405489260951684, + "rewards/rejected": -4.665331268310547, + "step": 312 + }, + { + "epoch": 0.5417568152315015, + "grad_norm": 15.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -6566161.454545454, + "logits/rejected": -7930920.380952381, + "logps/chosen": -161.76050914417613, + "logps/rejected": -267.10614304315476, + "loss": 0.2135, + "rewards/chosen": 0.15408051013946533, + "rewards/margins": 3.8461018346604847, + "rewards/rejected": -3.6920213245210194, + "step": 313 + }, + { + "epoch": 0.5434876676763306, + "grad_norm": 14.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10360809.142857144, + "logits/rejected": -20295100.444444444, + "logps/chosen": -132.26026262555803, + "logps/rejected": -279.2024739583333, + "loss": 0.2336, + "rewards/chosen": 0.14011728763580322, + "rewards/margins": 4.056061704953512, + "rewards/rejected": -3.9159444173177085, + "step": 314 + }, + { + "epoch": 0.5452185201211597, + "grad_norm": 20.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16829835.29411765, + "logits/rejected": -19828985.6, + "logps/chosen": -206.92171702665442, + "logps/rejected": -321.71770833333335, + "loss": 0.2818, + "rewards/chosen": 0.2053097837111529, + "rewards/margins": 3.878471608255424, + "rewards/rejected": -3.673161824544271, + "step": 315 + }, + { + "epoch": 0.5469493725659887, + "grad_norm": 15.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -2378658.5, + "logits/rejected": -9286079.0, + "logps/chosen": -97.55261993408203, + "logps/rejected": -260.53057861328125, + "loss": 0.3103, + "rewards/chosen": -0.3016693890094757, + "rewards/margins": 3.752805918455124, + "rewards/rejected": -4.0544753074646, + "step": 316 + }, + { + "epoch": 0.5486802250108178, + "grad_norm": 18.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12954272.94117647, + "logits/rejected": -15589052.8, + "logps/chosen": -190.67186063878677, + "logps/rejected": -250.44130859375, + "loss": 0.3112, + "rewards/chosen": -0.06326676116270177, + "rewards/margins": 3.7333831716986263, + "rewards/rejected": -3.796649932861328, + "step": 317 + }, + { + "epoch": 0.5504110774556469, + "grad_norm": 22.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21864654.933333334, + "logits/rejected": -10707897.411764706, + "logps/chosen": -284.33079427083334, + "logps/rejected": -290.6952263327206, + "loss": 0.2596, + "rewards/chosen": 0.10879329045613607, + "rewards/margins": 4.429270944408342, + "rewards/rejected": -4.320477653952206, + "step": 318 + }, + { + "epoch": 0.552141929900476, + "grad_norm": 20.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14460608.0, + "logits/rejected": 6251302.5, + "logps/chosen": -212.59942626953125, + "logps/rejected": -303.3627624511719, + "loss": 0.2394, + "rewards/chosen": 0.2094944417476654, + "rewards/margins": 5.575104087591171, + "rewards/rejected": -5.365609645843506, + "step": 319 + }, + { + "epoch": 0.553872782345305, + "grad_norm": 21.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -26788375.57894737, + "logits/rejected": -7825840.0, + "logps/chosen": -239.8508172286184, + "logps/rejected": -207.13835261418268, + "loss": 0.3256, + "rewards/chosen": -0.13202020996495298, + "rewards/margins": 3.9472348207404258, + "rewards/rejected": -4.0792550307053785, + "step": 320 + }, + { + "epoch": 0.5556036347901342, + "grad_norm": 18.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12613575.578947369, + "logits/rejected": -44182592.0, + "logps/chosen": -172.88951994243422, + "logps/rejected": -394.89855018028845, + "loss": 0.277, + "rewards/chosen": 0.22804290369937294, + "rewards/margins": 5.308141569376956, + "rewards/rejected": -5.080098665677584, + "step": 321 + }, + { + "epoch": 0.5573344872349633, + "grad_norm": 18.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12038496.727272727, + "logits/rejected": -13528341.333333334, + "logps/chosen": -193.1437100497159, + "logps/rejected": -260.3921363467262, + "loss": 0.2035, + "rewards/chosen": 0.25954266027970746, + "rewards/margins": 3.798412245589417, + "rewards/rejected": -3.5388695853097096, + "step": 322 + }, + { + "epoch": 0.5590653396797923, + "grad_norm": 19.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8107024.0, + "logits/rejected": -10226571.2, + "logps/chosen": -178.4284002130682, + "logps/rejected": -278.8669921875, + "loss": 0.3194, + "rewards/chosen": 0.31757978959517047, + "rewards/margins": 3.79486479325728, + "rewards/rejected": -3.4772850036621095, + "step": 323 + }, + { + "epoch": 0.5607961921246214, + "grad_norm": 21.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13521629.090909092, + "logits/rejected": -14930656.0, + "logps/chosen": -160.08689186789772, + "logps/rejected": -323.24326171875, + "loss": 0.3363, + "rewards/chosen": 0.18146746808832342, + "rewards/margins": 4.055506927316839, + "rewards/rejected": -3.8740394592285154, + "step": 324 + }, + { + "epoch": 0.5625270445694505, + "grad_norm": 17.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9283119.05882353, + "logits/rejected": -19759027.2, + "logps/chosen": -159.77513212316177, + "logps/rejected": -243.65621744791667, + "loss": 0.2922, + "rewards/chosen": 0.2326793390161851, + "rewards/margins": 2.9834321059432685, + "rewards/rejected": -2.7507527669270835, + "step": 325 + }, + { + "epoch": 0.5642578970142795, + "grad_norm": 21.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13160847.2, + "logits/rejected": -13422400.0, + "logps/chosen": -239.468359375, + "logps/rejected": -288.320068359375, + "loss": 0.3079, + "rewards/chosen": 0.38610186576843264, + "rewards/margins": 3.338702344894409, + "rewards/rejected": -2.9526004791259766, + "step": 326 + }, + { + "epoch": 0.5659887494591086, + "grad_norm": 16.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 6664449.777777778, + "logits/rejected": -13790154.285714285, + "logps/chosen": -132.85997178819446, + "logps/rejected": -236.58299037388392, + "loss": 0.2752, + "rewards/chosen": 0.22376439306471083, + "rewards/margins": 3.821674865389627, + "rewards/rejected": -3.5979104723249162, + "step": 327 + }, + { + "epoch": 0.5677196019039377, + "grad_norm": 14.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -5918864.571428572, + "logits/rejected": -15041781.333333334, + "logps/chosen": -83.88548060825893, + "logps/rejected": -206.26862250434027, + "loss": 0.299, + "rewards/chosen": -0.0441314663205828, + "rewards/margins": 3.410717564915854, + "rewards/rejected": -3.454849031236437, + "step": 328 + }, + { + "epoch": 0.5694504543487667, + "grad_norm": 17.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -2414931.4285714286, + "logits/rejected": -6198071.111111111, + "logps/chosen": -179.96451241629464, + "logps/rejected": -251.01491970486111, + "loss": 0.2229, + "rewards/chosen": 0.22425884859902517, + "rewards/margins": 4.415093741719685, + "rewards/rejected": -4.19083489312066, + "step": 329 + }, + { + "epoch": 0.5711813067935958, + "grad_norm": 17.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -2594426.75, + "logits/rejected": -10904819.0, + "logps/chosen": -130.1868896484375, + "logps/rejected": -286.26708984375, + "loss": 0.2606, + "rewards/chosen": 0.17086640000343323, + "rewards/margins": 4.094301134347916, + "rewards/rejected": -3.9234347343444824, + "step": 330 + }, + { + "epoch": 0.572912159238425, + "grad_norm": 19.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11815912.0, + "logits/rejected": -15974919.0, + "logps/chosen": -220.44569396972656, + "logps/rejected": -232.99757385253906, + "loss": 0.239, + "rewards/chosen": 0.524544894695282, + "rewards/margins": 3.826286017894745, + "rewards/rejected": -3.301741123199463, + "step": 331 + }, + { + "epoch": 0.574643011683254, + "grad_norm": 25.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8336921.263157895, + "logits/rejected": -3555189.5384615385, + "logps/chosen": -313.17259457236844, + "logps/rejected": -281.8469050480769, + "loss": 0.297, + "rewards/chosen": 0.31689523395739105, + "rewards/margins": 3.4732144853846747, + "rewards/rejected": -3.1563192514272838, + "step": 332 + }, + { + "epoch": 0.5763738641280831, + "grad_norm": 18.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8533329.142857144, + "logits/rejected": -5820462.666666667, + "logps/chosen": -214.5418701171875, + "logps/rejected": -307.847900390625, + "loss": 0.2235, + "rewards/chosen": 0.08273885931287493, + "rewards/margins": 4.703521075702849, + "rewards/rejected": -4.620782216389974, + "step": 333 + }, + { + "epoch": 0.5781047165729122, + "grad_norm": 19.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 1717640.75, + "logits/rejected": -5517426.0, + "logps/chosen": -146.82470703125, + "logps/rejected": -328.8900146484375, + "loss": 0.2928, + "rewards/chosen": 0.018802586942911148, + "rewards/margins": 3.479861918836832, + "rewards/rejected": -3.461059331893921, + "step": 334 + }, + { + "epoch": 0.5798355690177412, + "grad_norm": 20.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 1765888.888888889, + "logits/rejected": -7971482.285714285, + "logps/chosen": -191.85907660590277, + "logps/rejected": -252.76864188058036, + "loss": 0.2827, + "rewards/chosen": 0.2067281272676256, + "rewards/margins": 4.022728929443965, + "rewards/rejected": -3.8160008021763394, + "step": 335 + }, + { + "epoch": 0.5815664214625703, + "grad_norm": 17.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10202053.818181818, + "logits/rejected": -12688995.047619049, + "logps/chosen": -251.6657049005682, + "logps/rejected": -244.08217075892858, + "loss": 0.1869, + "rewards/chosen": 0.3494947173378684, + "rewards/margins": 4.283596003726447, + "rewards/rejected": -3.9341012863885787, + "step": 336 + }, + { + "epoch": 0.5832972739073994, + "grad_norm": 18.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18041316.0, + "logits/rejected": -9467369.0, + "logps/chosen": -183.19395446777344, + "logps/rejected": -250.51498413085938, + "loss": 0.294, + "rewards/chosen": -0.09669895470142365, + "rewards/margins": 2.7749326676130295, + "rewards/rejected": -2.871631622314453, + "step": 337 + }, + { + "epoch": 0.5850281263522285, + "grad_norm": 21.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3072201.8, + "logits/rejected": -14306997.333333334, + "logps/chosen": -190.6428955078125, + "logps/rejected": -363.647705078125, + "loss": 0.324, + "rewards/chosen": 0.0663179337978363, + "rewards/margins": 4.742912985881169, + "rewards/rejected": -4.676595052083333, + "step": 338 + }, + { + "epoch": 0.5867589787970575, + "grad_norm": 20.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10131879.529411765, + "logits/rejected": -14746457.6, + "logps/chosen": -232.2019473805147, + "logps/rejected": -264.80859375, + "loss": 0.2864, + "rewards/chosen": 0.21893815433277802, + "rewards/margins": 3.606466891718846, + "rewards/rejected": -3.387528737386068, + "step": 339 + }, + { + "epoch": 0.5884898312418866, + "grad_norm": 19.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8450442.94736842, + "logits/rejected": -17696190.769230768, + "logps/chosen": -176.38082082648026, + "logps/rejected": -312.70152869591345, + "loss": 0.2979, + "rewards/chosen": 0.17910035032975047, + "rewards/margins": 3.6266969736771064, + "rewards/rejected": -3.447596623347356, + "step": 340 + }, + { + "epoch": 0.5902206836867157, + "grad_norm": 18.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10404825.0, + "logits/rejected": -11997734.0, + "logps/chosen": -184.37428283691406, + "logps/rejected": -287.09124755859375, + "loss": 0.2813, + "rewards/chosen": 0.10754716396331787, + "rewards/margins": 3.519480586051941, + "rewards/rejected": -3.411933422088623, + "step": 341 + }, + { + "epoch": 0.5919515361315448, + "grad_norm": 16.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -1821964.8, + "logits/rejected": -3068290.3529411764, + "logps/chosen": -187.56603190104167, + "logps/rejected": -222.35047104779412, + "loss": 0.2528, + "rewards/chosen": 0.10934350490570069, + "rewards/margins": 4.271289072317235, + "rewards/rejected": -4.161945567411535, + "step": 342 + }, + { + "epoch": 0.5936823885763739, + "grad_norm": 20.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -6672596.8, + "logits/rejected": -1928376.1666666667, + "logps/chosen": -154.5575927734375, + "logps/rejected": -207.98529052734375, + "loss": 0.3436, + "rewards/chosen": 0.05052804946899414, + "rewards/margins": 3.174370606740316, + "rewards/rejected": -3.1238425572713218, + "step": 343 + }, + { + "epoch": 0.595413241021203, + "grad_norm": 17.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 5078587.076923077, + "logits/rejected": -15951186.52631579, + "logps/chosen": -203.24442232572116, + "logps/rejected": -283.1211194490132, + "loss": 0.236, + "rewards/chosen": 0.008584217383311344, + "rewards/margins": 3.9024175785572424, + "rewards/rejected": -3.893833361173931, + "step": 344 + }, + { + "epoch": 0.597144093466032, + "grad_norm": 17.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -6848732.0, + "logits/rejected": -8973029.142857144, + "logps/chosen": -144.63374837239584, + "logps/rejected": -238.96840122767858, + "loss": 0.3141, + "rewards/chosen": 0.04181914197074042, + "rewards/margins": 3.4218101283860585, + "rewards/rejected": -3.379990986415318, + "step": 345 + }, + { + "epoch": 0.5988749459108611, + "grad_norm": 22.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14234343.61904762, + "logits/rejected": -7106824.7272727275, + "logps/chosen": -258.7406296502976, + "logps/rejected": -140.10917524857953, + "loss": 0.3224, + "rewards/chosen": 0.293655758812314, + "rewards/margins": 3.0026994350152614, + "rewards/rejected": -2.7090436762029473, + "step": 346 + }, + { + "epoch": 0.6006057983556902, + "grad_norm": 13.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3528985.4545454546, + "logits/rejected": -14447954.285714285, + "logps/chosen": -179.65184437144887, + "logps/rejected": -286.34605189732144, + "loss": 0.1925, + "rewards/chosen": 0.5645605867559259, + "rewards/margins": 4.192075380515226, + "rewards/rejected": -3.6275147937593006, + "step": 347 + }, + { + "epoch": 0.6023366508005192, + "grad_norm": 17.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10800672.94117647, + "logits/rejected": -20403803.733333334, + "logps/chosen": -175.4804256663603, + "logps/rejected": -309.93170572916665, + "loss": 0.2781, + "rewards/chosen": 0.08703799808726591, + "rewards/margins": 4.335277133829453, + "rewards/rejected": -4.248239135742187, + "step": 348 + }, + { + "epoch": 0.6040675032453483, + "grad_norm": 19.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3859851.4285714286, + "logits/rejected": -24707605.333333332, + "logps/chosen": -242.44515555245536, + "logps/rejected": -315.48920355902777, + "loss": 0.2804, + "rewards/chosen": 0.1364647831235613, + "rewards/margins": 3.2929474796567644, + "rewards/rejected": -3.156482696533203, + "step": 349 + }, + { + "epoch": 0.6057983556901774, + "grad_norm": 20.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7096013.47368421, + "logits/rejected": 10576553.846153846, + "logps/chosen": -209.0187859786184, + "logps/rejected": -264.02599158653845, + "loss": 0.3112, + "rewards/chosen": 0.19617369300440737, + "rewards/margins": 3.5752809482064807, + "rewards/rejected": -3.379107255202073, + "step": 350 + }, + { + "epoch": 0.6075292081350064, + "grad_norm": 19.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10675886.11764706, + "logits/rejected": -15069337.6, + "logps/chosen": -173.53643439797793, + "logps/rejected": -283.91783854166664, + "loss": 0.268, + "rewards/chosen": 0.2641681502847111, + "rewards/margins": 4.036883655248904, + "rewards/rejected": -3.7727155049641925, + "step": 351 + }, + { + "epoch": 0.6092600605798356, + "grad_norm": 16.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12427122.285714285, + "logits/rejected": 2862197.777777778, + "logps/chosen": -179.94796316964286, + "logps/rejected": -246.80126953125, + "loss": 0.2643, + "rewards/chosen": -0.07165707860674177, + "rewards/margins": 4.549061559495472, + "rewards/rejected": -4.620718638102214, + "step": 352 + }, + { + "epoch": 0.6109909130246647, + "grad_norm": 16.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8503058.285714285, + "logits/rejected": -6545023.555555556, + "logps/chosen": -134.74617222377233, + "logps/rejected": -207.88804796006946, + "loss": 0.2359, + "rewards/chosen": 0.21191903523036412, + "rewards/margins": 3.6446320140172563, + "rewards/rejected": -3.4327129787868924, + "step": 353 + }, + { + "epoch": 0.6127217654694938, + "grad_norm": 20.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11578392.615384616, + "logits/rejected": -16602821.05263158, + "logps/chosen": -289.69076772836536, + "logps/rejected": -306.3754625822368, + "loss": 0.2377, + "rewards/chosen": 0.13199646656329816, + "rewards/margins": 4.4162588727619, + "rewards/rejected": -4.284262406198602, + "step": 354 + }, + { + "epoch": 0.6144526179143228, + "grad_norm": 19.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7813487.0, + "logits/rejected": -23554336.0, + "logps/chosen": -223.7386932373047, + "logps/rejected": -302.6226501464844, + "loss": 0.2748, + "rewards/chosen": 0.11314409971237183, + "rewards/margins": 4.6964752078056335, + "rewards/rejected": -4.583331108093262, + "step": 355 + }, + { + "epoch": 0.6161834703591519, + "grad_norm": 18.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16157925.647058824, + "logits/rejected": -16505361.066666666, + "logps/chosen": -195.59629193474265, + "logps/rejected": -288.43287760416666, + "loss": 0.2861, + "rewards/chosen": 0.05959615286658792, + "rewards/margins": 3.963687542840546, + "rewards/rejected": -3.904091389973958, + "step": 356 + }, + { + "epoch": 0.617914322803981, + "grad_norm": 17.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21620996.57142857, + "logits/rejected": -10515739.555555556, + "logps/chosen": -213.68040248325892, + "logps/rejected": -211.16574435763889, + "loss": 0.223, + "rewards/chosen": 0.4431783471788679, + "rewards/margins": 3.7781079193902394, + "rewards/rejected": -3.3349295722113714, + "step": 357 + }, + { + "epoch": 0.61964517524881, + "grad_norm": 20.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8937320.888888888, + "logits/rejected": -5759075.428571428, + "logps/chosen": -178.92529296875, + "logps/rejected": -213.58454241071428, + "loss": 0.3322, + "rewards/chosen": -0.00693051020304362, + "rewards/margins": 3.373754478636242, + "rewards/rejected": -3.3806849888392856, + "step": 358 + }, + { + "epoch": 0.6213760276936391, + "grad_norm": 17.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27906884.57142857, + "logits/rejected": -13232257.777777778, + "logps/chosen": -248.64763532366072, + "logps/rejected": -317.57953559027777, + "loss": 0.215, + "rewards/chosen": 0.2477287905556815, + "rewards/margins": 4.180022152643355, + "rewards/rejected": -3.9322933620876737, + "step": 359 + }, + { + "epoch": 0.6231068801384682, + "grad_norm": 16.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12288800.0, + "logits/rejected": -7868215.619047619, + "logps/chosen": -132.10096324573863, + "logps/rejected": -253.1708751860119, + "loss": 0.2152, + "rewards/chosen": 0.21286071430553088, + "rewards/margins": 3.283979750298834, + "rewards/rejected": -3.0711190359933034, + "step": 360 + }, + { + "epoch": 0.6248377325832972, + "grad_norm": 15.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13952132.923076924, + "logits/rejected": -4337719.157894737, + "logps/chosen": -184.3740234375, + "logps/rejected": -276.4233912417763, + "loss": 0.2489, + "rewards/chosen": 0.3947724929222694, + "rewards/margins": 4.056729681578725, + "rewards/rejected": -3.6619571886564555, + "step": 361 + }, + { + "epoch": 0.6265685850281264, + "grad_norm": 18.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -2559124.0, + "logits/rejected": -11727854.222222222, + "logps/chosen": -267.82090541294644, + "logps/rejected": -253.38324652777777, + "loss": 0.2538, + "rewards/chosen": -0.07954313925334386, + "rewards/margins": 4.434653044693054, + "rewards/rejected": -4.514196183946398, + "step": 362 + }, + { + "epoch": 0.6282994374729555, + "grad_norm": 21.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11280373.818181818, + "logits/rejected": -7648320.0, + "logps/chosen": -208.05641867897728, + "logps/rejected": -147.7589111328125, + "loss": 0.3312, + "rewards/chosen": 0.36006593704223633, + "rewards/margins": 3.347266674041748, + "rewards/rejected": -2.9872007369995117, + "step": 363 + }, + { + "epoch": 0.6300302899177845, + "grad_norm": 18.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7118560.0, + "logits/rejected": -8206084.666666667, + "logps/chosen": -121.705712890625, + "logps/rejected": -264.3739827473958, + "loss": 0.3282, + "rewards/chosen": -0.10566198825836182, + "rewards/margins": 4.391827305157979, + "rewards/rejected": -4.497489293416341, + "step": 364 + }, + { + "epoch": 0.6317611423626136, + "grad_norm": 18.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16051957.714285715, + "logits/rejected": -12511853.333333334, + "logps/chosen": -254.34000069754464, + "logps/rejected": -235.74953884548611, + "loss": 0.2525, + "rewards/chosen": 0.13838555131639754, + "rewards/margins": 3.961301370272561, + "rewards/rejected": -3.8229158189561634, + "step": 365 + }, + { + "epoch": 0.6334919948074427, + "grad_norm": 20.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11869288.533333333, + "logits/rejected": -20554912.0, + "logps/chosen": -257.72490234375, + "logps/rejected": -271.98078469669116, + "loss": 0.2886, + "rewards/chosen": -0.261674165725708, + "rewards/margins": 4.190633591483621, + "rewards/rejected": -4.452307757209329, + "step": 366 + }, + { + "epoch": 0.6352228472522717, + "grad_norm": 22.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13831125.76, + "logits/rejected": -18872251.42857143, + "logps/chosen": -199.99576171875, + "logps/rejected": -282.732666015625, + "loss": 0.3466, + "rewards/chosen": 0.2688837432861328, + "rewards/margins": 4.29412722996303, + "rewards/rejected": -4.025243486676898, + "step": 367 + }, + { + "epoch": 0.6369536996971008, + "grad_norm": 23.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22038176.0, + "logits/rejected": -16882546.133333333, + "logps/chosen": -310.03860294117646, + "logps/rejected": -275.8414713541667, + "loss": 0.3127, + "rewards/chosen": 0.028183612753363216, + "rewards/margins": 3.60563163114529, + "rewards/rejected": -3.577448018391927, + "step": 368 + }, + { + "epoch": 0.6386845521419299, + "grad_norm": 18.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15001413.333333334, + "logits/rejected": -13682121.6, + "logps/chosen": -274.2471923828125, + "logps/rejected": -278.1908203125, + "loss": 0.2168, + "rewards/chosen": 0.42730391025543213, + "rewards/margins": 3.9290212869644163, + "rewards/rejected": -3.501717376708984, + "step": 369 + }, + { + "epoch": 0.6404154045867589, + "grad_norm": 18.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15972298.0, + "logits/rejected": -4823944.5, + "logps/chosen": -245.4044189453125, + "logps/rejected": -206.94114685058594, + "loss": 0.2787, + "rewards/chosen": 0.3271653950214386, + "rewards/margins": 3.061715394258499, + "rewards/rejected": -2.7345499992370605, + "step": 370 + }, + { + "epoch": 0.642146257031588, + "grad_norm": 19.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7705861.894736842, + "logits/rejected": -28472534.153846152, + "logps/chosen": -187.05327405427633, + "logps/rejected": -364.1125676081731, + "loss": 0.3094, + "rewards/chosen": 0.010326274131473741, + "rewards/margins": 4.707725706974022, + "rewards/rejected": -4.697399432842548, + "step": 371 + }, + { + "epoch": 0.6438771094764172, + "grad_norm": 17.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18150483.2, + "logits/rejected": -21795271.529411763, + "logps/chosen": -223.94767252604166, + "logps/rejected": -307.9017980238971, + "loss": 0.2359, + "rewards/chosen": 0.4688817660013835, + "rewards/margins": 4.511685810837092, + "rewards/rejected": -4.042804044835708, + "step": 372 + }, + { + "epoch": 0.6456079619212463, + "grad_norm": 20.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16217340.8, + "logits/rejected": -9909528.470588235, + "logps/chosen": -238.28984375, + "logps/rejected": -222.5888671875, + "loss": 0.2546, + "rewards/chosen": 0.49732151031494143, + "rewards/margins": 3.3056646122651943, + "rewards/rejected": -2.808343101950253, + "step": 373 + }, + { + "epoch": 0.6473388143660753, + "grad_norm": 14.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -4849048.307692308, + "logits/rejected": -11516577.684210526, + "logps/chosen": -101.24676983173077, + "logps/rejected": -201.8884405838816, + "loss": 0.2434, + "rewards/chosen": 0.12669064448429987, + "rewards/margins": 3.2508367546174206, + "rewards/rejected": -3.124146110133121, + "step": 374 + }, + { + "epoch": 0.6490696668109044, + "grad_norm": 17.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28892482.666666668, + "logits/rejected": -5953007.6, + "logps/chosen": -219.9309285481771, + "logps/rejected": -300.281396484375, + "loss": 0.2221, + "rewards/chosen": 0.010543271899223328, + "rewards/margins": 4.412198850512505, + "rewards/rejected": -4.401655578613282, + "step": 375 + }, + { + "epoch": 0.6508005192557335, + "grad_norm": 18.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21528912.94117647, + "logits/rejected": -24916744.533333335, + "logps/chosen": -239.7306698069853, + "logps/rejected": -286.5082682291667, + "loss": 0.2613, + "rewards/chosen": 0.3170791513779584, + "rewards/margins": 4.208893319672229, + "rewards/rejected": -3.8918141682942706, + "step": 376 + }, + { + "epoch": 0.6525313717005625, + "grad_norm": 18.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 1675282.3529411764, + "logits/rejected": -11170286.933333334, + "logps/chosen": -182.6720473345588, + "logps/rejected": -243.52550455729167, + "loss": 0.268, + "rewards/chosen": 0.16008928242851705, + "rewards/margins": 4.650997587278778, + "rewards/rejected": -4.490908304850261, + "step": 377 + }, + { + "epoch": 0.6542622241453916, + "grad_norm": 16.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13637652.0, + "logits/rejected": -14450204.0, + "logps/chosen": -192.2814178466797, + "logps/rejected": -303.75457763671875, + "loss": 0.2456, + "rewards/chosen": 0.2808271050453186, + "rewards/margins": 3.9439082741737366, + "rewards/rejected": -3.663081169128418, + "step": 378 + }, + { + "epoch": 0.6559930765902207, + "grad_norm": 16.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15642123.636363637, + "logits/rejected": -8306617.904761905, + "logps/chosen": -156.69376997514203, + "logps/rejected": -185.15342494419642, + "loss": 0.2423, + "rewards/chosen": 0.02969127893447876, + "rewards/margins": 3.1654334664344788, + "rewards/rejected": -3.1357421875, + "step": 379 + }, + { + "epoch": 0.6577239290350497, + "grad_norm": 17.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8884224.727272727, + "logits/rejected": -14000131.047619049, + "logps/chosen": -302.678955078125, + "logps/rejected": -204.98786272321428, + "loss": 0.1865, + "rewards/chosen": 0.5055931264703925, + "rewards/margins": 3.6664908809579297, + "rewards/rejected": -3.160897754487537, + "step": 380 + }, + { + "epoch": 0.6594547814798788, + "grad_norm": 20.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9105338.0, + "logits/rejected": -30175074.0, + "logps/chosen": -226.66571044921875, + "logps/rejected": -358.23699951171875, + "loss": 0.2645, + "rewards/chosen": 0.30727851390838623, + "rewards/margins": 4.191656708717346, + "rewards/rejected": -3.88437819480896, + "step": 381 + }, + { + "epoch": 0.661185633924708, + "grad_norm": 20.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -388777.0, + "logits/rejected": -17145814.0, + "logps/chosen": -236.92369079589844, + "logps/rejected": -263.4542541503906, + "loss": 0.287, + "rewards/chosen": -0.011938914656639099, + "rewards/margins": 3.5170871168375015, + "rewards/rejected": -3.5290260314941406, + "step": 382 + }, + { + "epoch": 0.662916486369537, + "grad_norm": 19.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10614533.05263158, + "logits/rejected": -7054681.846153846, + "logps/chosen": -175.78060752467104, + "logps/rejected": -211.03123121995193, + "loss": 0.3054, + "rewards/chosen": 0.2919558474892064, + "rewards/margins": 3.441271608175054, + "rewards/rejected": -3.1493157606858473, + "step": 383 + }, + { + "epoch": 0.6646473388143661, + "grad_norm": 16.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3906967.2, + "logits/rejected": -5356351.05882353, + "logps/chosen": -151.367041015625, + "logps/rejected": -228.42249253216912, + "loss": 0.2549, + "rewards/chosen": 0.0117604931195577, + "rewards/margins": 3.961161002224567, + "rewards/rejected": -3.949400509105009, + "step": 384 + }, + { + "epoch": 0.6663781912591952, + "grad_norm": 17.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10104050.0, + "logits/rejected": -13819494.0, + "logps/chosen": -142.1748504638672, + "logps/rejected": -234.27272033691406, + "loss": 0.2891, + "rewards/chosen": -0.00037301331758499146, + "rewards/margins": 3.8429132625460625, + "rewards/rejected": -3.8432862758636475, + "step": 385 + }, + { + "epoch": 0.6681090437040242, + "grad_norm": 20.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21388817.6, + "logits/rejected": -12234436.0, + "logps/chosen": -221.1366455078125, + "logps/rejected": -259.32480875651044, + "loss": 0.3225, + "rewards/chosen": 0.17770171165466309, + "rewards/margins": 4.403426885604858, + "rewards/rejected": -4.225725173950195, + "step": 386 + }, + { + "epoch": 0.6698398961488533, + "grad_norm": 17.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 4042078.6666666665, + "logits/rejected": -8433456.0, + "logps/chosen": -174.63717447916667, + "logps/rejected": -238.44002757352942, + "loss": 0.2942, + "rewards/chosen": -0.2784555117289225, + "rewards/margins": 3.808582865023146, + "rewards/rejected": -4.087038376752068, + "step": 387 + }, + { + "epoch": 0.6715707485936824, + "grad_norm": 21.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -5466052.235294118, + "logits/rejected": -19294852.266666666, + "logps/chosen": -247.90438304227942, + "logps/rejected": -286.71985677083336, + "loss": 0.2998, + "rewards/chosen": 0.060690725550932044, + "rewards/margins": 2.9108534859675985, + "rewards/rejected": -2.8501627604166666, + "step": 388 + }, + { + "epoch": 0.6733016010385114, + "grad_norm": 18.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -2184413.6842105263, + "logits/rejected": -8799113.846153846, + "logps/chosen": -165.29582134046052, + "logps/rejected": -266.46243990384613, + "loss": 0.2842, + "rewards/chosen": 0.1969998133809943, + "rewards/margins": 4.636947675272521, + "rewards/rejected": -4.439947861891526, + "step": 389 + }, + { + "epoch": 0.6750324534833405, + "grad_norm": 19.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16991953.777777776, + "logits/rejected": -12042065.142857144, + "logps/chosen": -219.54055447048611, + "logps/rejected": -206.49086216517858, + "loss": 0.2935, + "rewards/chosen": 0.13866905371348062, + "rewards/margins": 4.394011809712365, + "rewards/rejected": -4.255342755998884, + "step": 390 + }, + { + "epoch": 0.6767633059281696, + "grad_norm": 18.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17843486.11764706, + "logits/rejected": -15110476.8, + "logps/chosen": -209.19716509650735, + "logps/rejected": -260.31140950520836, + "loss": 0.2755, + "rewards/chosen": 0.2135008924147662, + "rewards/margins": 3.3063346021315634, + "rewards/rejected": -3.092833709716797, + "step": 391 + }, + { + "epoch": 0.6784941583729988, + "grad_norm": 20.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13394701.47368421, + "logits/rejected": -28176009.846153848, + "logps/chosen": -197.44514545641448, + "logps/rejected": -286.12218299278845, + "loss": 0.3262, + "rewards/chosen": 0.0030214974754735045, + "rewards/margins": 4.019969317111892, + "rewards/rejected": -4.016947819636418, + "step": 392 + }, + { + "epoch": 0.6802250108178278, + "grad_norm": 16.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10810348.307692308, + "logits/rejected": -11191107.368421054, + "logps/chosen": -195.60421048677884, + "logps/rejected": -282.5986842105263, + "loss": 0.232, + "rewards/chosen": 0.2605471427624042, + "rewards/margins": 3.74175233010821, + "rewards/rejected": -3.481205187345806, + "step": 393 + }, + { + "epoch": 0.6819558632626569, + "grad_norm": 20.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11212326.222222222, + "logits/rejected": -20486288.0, + "logps/chosen": -207.88640679253473, + "logps/rejected": -419.60177176339283, + "loss": 0.3117, + "rewards/chosen": 0.024086718757947285, + "rewards/margins": 4.842109310485068, + "rewards/rejected": -4.818022591727121, + "step": 394 + }, + { + "epoch": 0.683686715707486, + "grad_norm": 19.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21567934.0, + "logits/rejected": -24650774.0, + "logps/chosen": -268.11749267578125, + "logps/rejected": -356.9142761230469, + "loss": 0.2524, + "rewards/chosen": 0.27397921681404114, + "rewards/margins": 4.13662913441658, + "rewards/rejected": -3.862649917602539, + "step": 395 + }, + { + "epoch": 0.685417568152315, + "grad_norm": 17.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17168701.53846154, + "logits/rejected": -13234468.210526315, + "logps/chosen": -194.9405235877404, + "logps/rejected": -247.79887952302633, + "loss": 0.2372, + "rewards/chosen": 0.1489866146674523, + "rewards/margins": 3.592986457260997, + "rewards/rejected": -3.4439998425935445, + "step": 396 + }, + { + "epoch": 0.6871484205971441, + "grad_norm": 19.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8208964.571428572, + "logits/rejected": -4894575.555555556, + "logps/chosen": -210.4195556640625, + "logps/rejected": -230.52845594618054, + "loss": 0.2675, + "rewards/chosen": -0.12185014145714897, + "rewards/margins": 3.6312278953809587, + "rewards/rejected": -3.7530780368381076, + "step": 397 + }, + { + "epoch": 0.6888792730419732, + "grad_norm": 19.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -4660458.222222222, + "logits/rejected": -15927796.57142857, + "logps/chosen": -228.41726345486111, + "logps/rejected": -304.3857421875, + "loss": 0.25, + "rewards/chosen": 0.37790894508361816, + "rewards/margins": 5.155334983553205, + "rewards/rejected": -4.777426038469587, + "step": 398 + }, + { + "epoch": 0.6906101254868022, + "grad_norm": 19.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17893813.714285713, + "logits/rejected": -15725480.888888888, + "logps/chosen": -238.42002650669642, + "logps/rejected": -288.1274685329861, + "loss": 0.2502, + "rewards/chosen": 0.19092067650386266, + "rewards/margins": 3.556104188873654, + "rewards/rejected": -3.3651835123697915, + "step": 399 + }, + { + "epoch": 0.6923409779316313, + "grad_norm": 17.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 918696.1111111111, + "logits/rejected": -11522752.0, + "logps/chosen": -124.32207573784723, + "logps/rejected": -281.4178466796875, + "loss": 0.3106, + "rewards/chosen": -0.00982724130153656, + "rewards/margins": 3.6492314657994678, + "rewards/rejected": -3.6590587071010043, + "step": 400 + }, + { + "epoch": 0.6940718303764604, + "grad_norm": 19.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12883313.454545455, + "logits/rejected": -4506007.6, + "logps/chosen": -207.15174449573863, + "logps/rejected": -214.768603515625, + "loss": 0.343, + "rewards/chosen": 0.15950042551214044, + "rewards/margins": 4.1166651899164375, + "rewards/rejected": -3.957164764404297, + "step": 401 + }, + { + "epoch": 0.6958026828212894, + "grad_norm": 18.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24766320.0, + "logits/rejected": -19491955.2, + "logps/chosen": -244.98311360677084, + "logps/rejected": -363.9444091796875, + "loss": 0.2226, + "rewards/chosen": 0.12681794166564941, + "rewards/margins": 3.851882791519165, + "rewards/rejected": -3.7250648498535157, + "step": 402 + }, + { + "epoch": 0.6975335352661186, + "grad_norm": 19.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18664481.454545453, + "logits/rejected": -7460914.285714285, + "logps/chosen": -282.8505193536932, + "logps/rejected": -294.13888113839283, + "loss": 0.2368, + "rewards/chosen": 0.09056963703849098, + "rewards/margins": 3.6263260660749497, + "rewards/rejected": -3.5357564290364585, + "step": 403 + }, + { + "epoch": 0.6992643877109477, + "grad_norm": 13.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18533613.333333332, + "logits/rejected": -13516030.4, + "logps/chosen": -212.40104166666666, + "logps/rejected": -241.956787109375, + "loss": 0.1849, + "rewards/chosen": 0.34865784645080566, + "rewards/margins": 4.211439752578736, + "rewards/rejected": -3.8627819061279296, + "step": 404 + }, + { + "epoch": 0.7009952401557767, + "grad_norm": 18.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15473829.647058824, + "logits/rejected": -11342652.8, + "logps/chosen": -193.92112821691177, + "logps/rejected": -232.93429361979167, + "loss": 0.2789, + "rewards/chosen": 0.18107993462506464, + "rewards/margins": 3.401664295383528, + "rewards/rejected": -3.2205843607584637, + "step": 405 + }, + { + "epoch": 0.7027260926006058, + "grad_norm": 20.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12117576.888888888, + "logits/rejected": -13038384.0, + "logps/chosen": -249.60536024305554, + "logps/rejected": -216.10743931361608, + "loss": 0.2775, + "rewards/chosen": 0.34258826573689777, + "rewards/margins": 3.9860533078511557, + "rewards/rejected": -3.643465042114258, + "step": 406 + }, + { + "epoch": 0.7044569450454349, + "grad_norm": 14.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8714836.666666666, + "logits/rejected": -22378428.8, + "logps/chosen": -136.01717122395834, + "logps/rejected": -348.047314453125, + "loss": 0.2208, + "rewards/chosen": 0.0772122045358022, + "rewards/margins": 3.7641903539498647, + "rewards/rejected": -3.6869781494140623, + "step": 407 + }, + { + "epoch": 0.706187797490264, + "grad_norm": 15.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -5694276.307692308, + "logits/rejected": -8889362.52631579, + "logps/chosen": -173.24391526442307, + "logps/rejected": -158.02138157894737, + "loss": 0.2693, + "rewards/chosen": 0.18675978367145246, + "rewards/margins": 3.0151329397672586, + "rewards/rejected": -2.828373156095806, + "step": 408 + }, + { + "epoch": 0.707918649935093, + "grad_norm": 21.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18351624.888888888, + "logits/rejected": 1312963.857142857, + "logps/chosen": -197.83175998263889, + "logps/rejected": -146.22428676060267, + "loss": 0.3214, + "rewards/chosen": 0.09451577398512098, + "rewards/margins": 3.0132324771275596, + "rewards/rejected": -2.918716703142439, + "step": 409 + }, + { + "epoch": 0.7096495023799221, + "grad_norm": 21.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3099474.3529411764, + "logits/rejected": -24587089.066666666, + "logps/chosen": -254.81488396139707, + "logps/rejected": -329.778125, + "loss": 0.2655, + "rewards/chosen": 0.12878047718721278, + "rewards/margins": 4.448991760553098, + "rewards/rejected": -4.320211283365885, + "step": 410 + }, + { + "epoch": 0.7113803548247511, + "grad_norm": 19.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9474645.333333334, + "logits/rejected": -16116789.714285715, + "logps/chosen": -177.89995659722223, + "logps/rejected": -219.65004185267858, + "loss": 0.3089, + "rewards/chosen": 0.07183490859137641, + "rewards/margins": 3.8493407832251654, + "rewards/rejected": -3.777505874633789, + "step": 411 + }, + { + "epoch": 0.7131112072695802, + "grad_norm": 22.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15355790.0, + "logits/rejected": -24347414.0, + "logps/chosen": -270.8287658691406, + "logps/rejected": -315.73907470703125, + "loss": 0.2821, + "rewards/chosen": 0.22467941045761108, + "rewards/margins": 3.432914674282074, + "rewards/rejected": -3.208235263824463, + "step": 412 + }, + { + "epoch": 0.7148420597144094, + "grad_norm": 17.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -5671832.5, + "logits/rejected": -9062221.0, + "logps/chosen": -153.04049682617188, + "logps/rejected": -204.20892333984375, + "loss": 0.2814, + "rewards/chosen": 0.11974607408046722, + "rewards/margins": 3.502686843276024, + "rewards/rejected": -3.3829407691955566, + "step": 413 + }, + { + "epoch": 0.7165729121592385, + "grad_norm": 19.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16363633.777777778, + "logits/rejected": -9754731.42857143, + "logps/chosen": -197.049072265625, + "logps/rejected": -190.04244559151786, + "loss": 0.2892, + "rewards/chosen": 0.2748352421654595, + "rewards/margins": 3.228357788116213, + "rewards/rejected": -2.9535225459507535, + "step": 414 + }, + { + "epoch": 0.7183037646040675, + "grad_norm": 19.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17049558.588235293, + "logits/rejected": -8489429.333333334, + "logps/chosen": -248.37281709558823, + "logps/rejected": -240.55061848958334, + "loss": 0.2521, + "rewards/chosen": 0.34352756949032054, + "rewards/margins": 3.7916334451413625, + "rewards/rejected": -3.448105875651042, + "step": 415 + }, + { + "epoch": 0.7200346170488966, + "grad_norm": 20.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 805961.875, + "logits/rejected": -15496698.0, + "logps/chosen": -184.46556091308594, + "logps/rejected": -246.5825653076172, + "loss": 0.2846, + "rewards/chosen": 0.10234836488962173, + "rewards/margins": 3.252162493765354, + "rewards/rejected": -3.1498141288757324, + "step": 416 + }, + { + "epoch": 0.7217654694937257, + "grad_norm": 21.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24636882.82352941, + "logits/rejected": -12409627.733333332, + "logps/chosen": -330.4458582261029, + "logps/rejected": -373.5914713541667, + "loss": 0.2594, + "rewards/chosen": 0.259279587689568, + "rewards/margins": 3.934955297731886, + "rewards/rejected": -3.675675710042318, + "step": 417 + }, + { + "epoch": 0.7234963219385547, + "grad_norm": 19.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10155608.0, + "logits/rejected": -24280902.0, + "logps/chosen": -222.4298858642578, + "logps/rejected": -263.7016906738281, + "loss": 0.2568, + "rewards/chosen": 0.14977231621742249, + "rewards/margins": 4.1859427988529205, + "rewards/rejected": -4.036170482635498, + "step": 418 + }, + { + "epoch": 0.7252271743833838, + "grad_norm": 20.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15192890.0, + "logits/rejected": 3364227.25, + "logps/chosen": -271.9737854003906, + "logps/rejected": -241.6377410888672, + "loss": 0.2643, + "rewards/chosen": 0.18620336055755615, + "rewards/margins": 4.176789402961731, + "rewards/rejected": -3.990586042404175, + "step": 419 + }, + { + "epoch": 0.7269580268282129, + "grad_norm": 19.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13059384.615384616, + "logits/rejected": -13372461.47368421, + "logps/chosen": -309.49624399038464, + "logps/rejected": -296.7671412417763, + "loss": 0.1777, + "rewards/chosen": 0.6419920554527869, + "rewards/margins": 3.972150891415986, + "rewards/rejected": -3.330158835963199, + "step": 420 + }, + { + "epoch": 0.7286888792730419, + "grad_norm": 20.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21641106.82352941, + "logits/rejected": -9575245.866666667, + "logps/chosen": -244.3195082720588, + "logps/rejected": -234.54254557291668, + "loss": 0.2751, + "rewards/chosen": 0.3425563643960392, + "rewards/margins": 3.0913252344318467, + "rewards/rejected": -2.7487688700358075, + "step": 421 + }, + { + "epoch": 0.730419731717871, + "grad_norm": 18.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10915912.470588235, + "logits/rejected": -12489817.6, + "logps/chosen": -219.41150620404412, + "logps/rejected": -194.45133463541666, + "loss": 0.295, + "rewards/chosen": 0.25638793496524587, + "rewards/margins": 3.5416565913780063, + "rewards/rejected": -3.2852686564127604, + "step": 422 + }, + { + "epoch": 0.7321505841627002, + "grad_norm": 16.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13010798.76923077, + "logits/rejected": -17895700.210526317, + "logps/chosen": -204.37747896634616, + "logps/rejected": -272.21029502467104, + "loss": 0.2213, + "rewards/chosen": 0.3280166479257437, + "rewards/margins": 4.0214932746732766, + "rewards/rejected": -3.693476626747533, + "step": 423 + }, + { + "epoch": 0.7338814366075292, + "grad_norm": 18.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25096371.555555556, + "logits/rejected": -14827675.42857143, + "logps/chosen": -259.487060546875, + "logps/rejected": -255.24269321986608, + "loss": 0.2537, + "rewards/chosen": 0.6062867906358507, + "rewards/margins": 3.9723584916856556, + "rewards/rejected": -3.3660717010498047, + "step": 424 + }, + { + "epoch": 0.7356122890523583, + "grad_norm": 20.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10886356.0, + "logits/rejected": -11246288.8, + "logps/chosen": -130.8861083984375, + "logps/rejected": -190.6217529296875, + "loss": 0.2564, + "rewards/chosen": 0.07656551897525787, + "rewards/margins": 3.1436734825372694, + "rewards/rejected": -3.0671079635620115, + "step": 425 + }, + { + "epoch": 0.7373431414971874, + "grad_norm": 21.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12971242.352941176, + "logits/rejected": -4865361.6, + "logps/chosen": -260.65133846507354, + "logps/rejected": -270.06396484375, + "loss": 0.2753, + "rewards/chosen": 0.2961828007417567, + "rewards/margins": 3.9448318892834235, + "rewards/rejected": -3.6486490885416667, + "step": 426 + }, + { + "epoch": 0.7390739939420164, + "grad_norm": 16.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14953814.4, + "logits/rejected": -20692565.818181816, + "logps/chosen": -178.722509765625, + "logps/rejected": -320.9410511363636, + "loss": 0.1832, + "rewards/chosen": 0.3632230281829834, + "rewards/margins": 4.107506184144453, + "rewards/rejected": -3.74428315596147, + "step": 427 + }, + { + "epoch": 0.7408048463868455, + "grad_norm": 16.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -4080816.9411764704, + "logits/rejected": -13191356.8, + "logps/chosen": -106.93411075367646, + "logps/rejected": -236.608984375, + "loss": 0.3184, + "rewards/chosen": 0.005095874561982996, + "rewards/margins": 2.8842472824395875, + "rewards/rejected": -2.8791514078776044, + "step": 428 + }, + { + "epoch": 0.7425356988316746, + "grad_norm": 19.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7458655.333333333, + "logits/rejected": -10363449.6, + "logps/chosen": -274.15008544921875, + "logps/rejected": -291.81923828125, + "loss": 0.1935, + "rewards/chosen": 0.3417823712031047, + "rewards/margins": 4.188549033800761, + "rewards/rejected": -3.8467666625976564, + "step": 429 + }, + { + "epoch": 0.7442665512765037, + "grad_norm": 19.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -6628209.882352941, + "logits/rejected": -21571532.8, + "logps/chosen": -209.14427274816177, + "logps/rejected": -308.23619791666664, + "loss": 0.2655, + "rewards/chosen": 0.21884497474221623, + "rewards/margins": 4.741412215139352, + "rewards/rejected": -4.5225672403971355, + "step": 430 + }, + { + "epoch": 0.7459974037213327, + "grad_norm": 19.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24242556.23529412, + "logits/rejected": -24843059.2, + "logps/chosen": -208.53262867647058, + "logps/rejected": -442.03779296875, + "loss": 0.2635, + "rewards/chosen": 0.25568490869858684, + "rewards/margins": 5.138899223477233, + "rewards/rejected": -4.8832143147786455, + "step": 431 + }, + { + "epoch": 0.7477282561661618, + "grad_norm": 20.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15604645.333333334, + "logits/rejected": -14124722.285714285, + "logps/chosen": -243.62122938368054, + "logps/rejected": -262.1013881138393, + "loss": 0.2918, + "rewards/chosen": -0.009077443016899956, + "rewards/margins": 3.659095938243563, + "rewards/rejected": -3.668173381260463, + "step": 432 + }, + { + "epoch": 0.749459108610991, + "grad_norm": 20.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18218340.0, + "logits/rejected": -16595606.0, + "logps/chosen": -226.3037872314453, + "logps/rejected": -315.81903076171875, + "loss": 0.2522, + "rewards/chosen": 0.17157624661922455, + "rewards/margins": 4.432760462164879, + "rewards/rejected": -4.261184215545654, + "step": 433 + }, + { + "epoch": 0.75118996105582, + "grad_norm": 16.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -5292061.866666666, + "logits/rejected": -14201923.764705881, + "logps/chosen": -185.165234375, + "logps/rejected": -215.77549115349265, + "loss": 0.2388, + "rewards/chosen": 0.17510838508605958, + "rewards/margins": 5.099181548286887, + "rewards/rejected": -4.924073163200827, + "step": 434 + }, + { + "epoch": 0.7529208135006491, + "grad_norm": 19.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3707326.933333333, + "logits/rejected": -6759980.235294118, + "logps/chosen": -192.10901692708333, + "logps/rejected": -224.3739803538603, + "loss": 0.2808, + "rewards/chosen": -0.1366729736328125, + "rewards/margins": 3.285342721378102, + "rewards/rejected": -3.4220156950109146, + "step": 435 + }, + { + "epoch": 0.7546516659454782, + "grad_norm": 17.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14735996.0, + "logits/rejected": -19061198.0, + "logps/chosen": -161.44833374023438, + "logps/rejected": -348.6562194824219, + "loss": 0.2494, + "rewards/chosen": 0.11681022495031357, + "rewards/margins": 5.47369708865881, + "rewards/rejected": -5.356886863708496, + "step": 436 + }, + { + "epoch": 0.7563825183903072, + "grad_norm": 17.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25264342.0, + "logits/rejected": -9538981.0, + "logps/chosen": -254.5364990234375, + "logps/rejected": -250.4523468017578, + "loss": 0.22, + "rewards/chosen": 0.5350126028060913, + "rewards/margins": 4.980099558830261, + "rewards/rejected": -4.44508695602417, + "step": 437 + }, + { + "epoch": 0.7581133708351363, + "grad_norm": 16.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16868498.46153846, + "logits/rejected": -9602219.789473685, + "logps/chosen": -190.87798602764423, + "logps/rejected": -256.28628700657896, + "loss": 0.2165, + "rewards/chosen": 0.23992305535536546, + "rewards/margins": 3.8148316632398225, + "rewards/rejected": -3.574908607884457, + "step": 438 + }, + { + "epoch": 0.7598442232799654, + "grad_norm": 18.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8693454.933333334, + "logits/rejected": -13654310.588235294, + "logps/chosen": -154.370849609375, + "logps/rejected": -236.2052504595588, + "loss": 0.2975, + "rewards/chosen": 0.015996766090393067, + "rewards/margins": 3.337686147409327, + "rewards/rejected": -3.321689381318934, + "step": 439 + }, + { + "epoch": 0.7615750757247944, + "grad_norm": 24.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24929335.57894737, + "logits/rejected": -34953590.15384615, + "logps/chosen": -259.91873972039474, + "logps/rejected": -410.19200721153845, + "loss": 0.2905, + "rewards/chosen": 0.1934488949022795, + "rewards/margins": 4.283554685260603, + "rewards/rejected": -4.090105790358323, + "step": 440 + }, + { + "epoch": 0.7633059281696235, + "grad_norm": 16.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19067797.818181816, + "logits/rejected": -15738675.80952381, + "logps/chosen": -250.85207297585228, + "logps/rejected": -244.77797154017858, + "loss": 0.1767, + "rewards/chosen": 0.40040709755637427, + "rewards/margins": 4.185635281847669, + "rewards/rejected": -3.7852281842912947, + "step": 441 + }, + { + "epoch": 0.7650367806144526, + "grad_norm": 20.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18492139.789473683, + "logits/rejected": -8682475.076923076, + "logps/chosen": -166.6618009868421, + "logps/rejected": -261.5407151442308, + "loss": 0.3117, + "rewards/chosen": -0.005474517219945004, + "rewards/margins": 4.423705041167225, + "rewards/rejected": -4.42917955838717, + "step": 442 + }, + { + "epoch": 0.7667676330592818, + "grad_norm": 20.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19419906.46153846, + "logits/rejected": -13736991.157894736, + "logps/chosen": -314.8329514723558, + "logps/rejected": -216.7960783305921, + "loss": 0.2461, + "rewards/chosen": 0.2657426137190599, + "rewards/margins": 3.5493117488830195, + "rewards/rejected": -3.2835691351639595, + "step": 443 + }, + { + "epoch": 0.7684984855041108, + "grad_norm": 15.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -4494598.571428572, + "logits/rejected": -2699613.5555555555, + "logps/chosen": -123.56604875837054, + "logps/rejected": -266.0947265625, + "loss": 0.2477, + "rewards/chosen": 0.12361056464059013, + "rewards/margins": 3.99099091878013, + "rewards/rejected": -3.8673803541395397, + "step": 444 + }, + { + "epoch": 0.7702293379489399, + "grad_norm": 17.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -6884477.333333333, + "logits/rejected": -1802673.2, + "logps/chosen": -116.48790486653645, + "logps/rejected": -189.2822021484375, + "loss": 0.2477, + "rewards/chosen": 0.05934780836105347, + "rewards/margins": 2.957999885082245, + "rewards/rejected": -2.8986520767211914, + "step": 445 + }, + { + "epoch": 0.771960190393769, + "grad_norm": 19.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15380920.888888888, + "logits/rejected": -19985661.714285713, + "logps/chosen": -238.71042209201389, + "logps/rejected": -304.8035365513393, + "loss": 0.3298, + "rewards/chosen": 0.17239532205793592, + "rewards/margins": 2.771995432793148, + "rewards/rejected": -2.5996001107352122, + "step": 446 + }, + { + "epoch": 0.773691042838598, + "grad_norm": 18.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10023824.94117647, + "logits/rejected": -12020683.733333332, + "logps/chosen": -181.765625, + "logps/rejected": -234.22122395833333, + "loss": 0.2793, + "rewards/chosen": 0.15763366923612707, + "rewards/margins": 4.021428631801231, + "rewards/rejected": -3.863794962565104, + "step": 447 + }, + { + "epoch": 0.7754218952834271, + "grad_norm": 22.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19391936.0, + "logits/rejected": -28420565.333333332, + "logps/chosen": -276.42052504595586, + "logps/rejected": -381.76031901041665, + "loss": 0.2465, + "rewards/chosen": 0.3492642290451947, + "rewards/margins": 4.498522651896757, + "rewards/rejected": -4.149258422851562, + "step": 448 + }, + { + "epoch": 0.7771527477282562, + "grad_norm": 17.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13008866.133333333, + "logits/rejected": -11776538.352941176, + "logps/chosen": -193.7296875, + "logps/rejected": -233.73586856617646, + "loss": 0.2573, + "rewards/chosen": 0.18115135828653972, + "rewards/margins": 3.2769045231389065, + "rewards/rejected": -3.0957531648523666, + "step": 449 + }, + { + "epoch": 0.7788836001730852, + "grad_norm": 16.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -4430243.142857143, + "logits/rejected": -20104984.888888888, + "logps/chosen": -135.77996826171875, + "logps/rejected": -306.85772026909723, + "loss": 0.2556, + "rewards/chosen": -0.17125649111611502, + "rewards/margins": 4.2319531951631815, + "rewards/rejected": -4.403209686279297, + "step": 450 + }, + { + "epoch": 0.7806144526179143, + "grad_norm": 19.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3794293.777777778, + "logits/rejected": -15451656.0, + "logps/chosen": -234.96622721354166, + "logps/rejected": -250.50319126674108, + "loss": 0.3017, + "rewards/chosen": 0.059784889221191406, + "rewards/margins": 3.7921803338187083, + "rewards/rejected": -3.732395444597517, + "step": 451 + }, + { + "epoch": 0.7823453050627434, + "grad_norm": 20.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -6830123.5, + "logits/rejected": -12880535.0, + "logps/chosen": -231.75677490234375, + "logps/rejected": -277.2532958984375, + "loss": 0.2417, + "rewards/chosen": 0.2947371006011963, + "rewards/margins": 4.360699892044067, + "rewards/rejected": -4.065962791442871, + "step": 452 + }, + { + "epoch": 0.7840761575075724, + "grad_norm": 23.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 17627772.0, + "logits/rejected": -14619754.0, + "logps/chosen": -329.4761962890625, + "logps/rejected": -283.1095275878906, + "loss": 0.2665, + "rewards/chosen": 0.36727145314216614, + "rewards/margins": 3.082612544298172, + "rewards/rejected": -2.715341091156006, + "step": 453 + }, + { + "epoch": 0.7858070099524016, + "grad_norm": 18.0, + "kl": 0.11761283874511719, + "learning_rate": 5e-06, + "logits/chosen": -9774634.666666666, + "logits/rejected": -21158379.42857143, + "logps/chosen": -161.02365451388889, + "logps/rejected": -277.0223388671875, + "loss": 0.3392, + "rewards/chosen": -0.040952947404649526, + "rewards/margins": 3.27204868528578, + "rewards/rejected": -3.3130016326904297, + "step": 454 + }, + { + "epoch": 0.7875378623972307, + "grad_norm": 21.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20624202.10526316, + "logits/rejected": -15552576.0, + "logps/chosen": -241.53258634868422, + "logps/rejected": -283.8398249699519, + "loss": 0.3202, + "rewards/chosen": -0.01610117523293746, + "rewards/margins": 4.267229595406335, + "rewards/rejected": -4.283330770639273, + "step": 455 + }, + { + "epoch": 0.7892687148420597, + "grad_norm": 16.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15570027.789473685, + "logits/rejected": -14045777.23076923, + "logps/chosen": -134.73129111842104, + "logps/rejected": -211.42063551682693, + "loss": 0.3085, + "rewards/chosen": 0.09267089241429378, + "rewards/margins": 3.651899936228146, + "rewards/rejected": -3.5592290438138523, + "step": 456 + }, + { + "epoch": 0.7909995672868888, + "grad_norm": 19.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12399494.736842105, + "logits/rejected": -21610971.076923076, + "logps/chosen": -181.78718647203948, + "logps/rejected": -268.7252666766827, + "loss": 0.3074, + "rewards/chosen": 0.1555995690195184, + "rewards/margins": 4.185192816653232, + "rewards/rejected": -4.029593247633714, + "step": 457 + }, + { + "epoch": 0.7927304197317179, + "grad_norm": 17.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23126056.533333335, + "logits/rejected": -24268367.05882353, + "logps/chosen": -196.58671875, + "logps/rejected": -251.80411305147058, + "loss": 0.2311, + "rewards/chosen": 0.4466190020243327, + "rewards/margins": 4.468565628575344, + "rewards/rejected": -4.021946626551011, + "step": 458 + }, + { + "epoch": 0.7944612721765469, + "grad_norm": 15.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -2661574.0, + "logits/rejected": -982190.0, + "logps/chosen": -102.0928726196289, + "logps/rejected": -225.1731414794922, + "loss": 0.3108, + "rewards/chosen": -0.19069679081439972, + "rewards/margins": 2.8766087740659714, + "rewards/rejected": -3.067305564880371, + "step": 459 + }, + { + "epoch": 0.796192124621376, + "grad_norm": 20.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15123262.222222222, + "logits/rejected": -23827974.85714286, + "logps/chosen": -204.50777180989584, + "logps/rejected": -322.21812220982144, + "loss": 0.3165, + "rewards/chosen": 0.11603225602044, + "rewards/margins": 3.537644041909112, + "rewards/rejected": -3.421611785888672, + "step": 460 + }, + { + "epoch": 0.7979229770662051, + "grad_norm": 18.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13073977.6, + "logits/rejected": -17592845.333333332, + "logps/chosen": -150.07423095703126, + "logps/rejected": -290.4130859375, + "loss": 0.31, + "rewards/chosen": 0.1515453577041626, + "rewards/margins": 4.387944372495015, + "rewards/rejected": -4.236399014790853, + "step": 461 + }, + { + "epoch": 0.7996538295110341, + "grad_norm": 16.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10161326.933333334, + "logits/rejected": -12855152.94117647, + "logps/chosen": -175.26847330729166, + "logps/rejected": -211.18441233915442, + "loss": 0.2629, + "rewards/chosen": 0.14281096458435058, + "rewards/margins": 3.5822302060968734, + "rewards/rejected": -3.439419241512523, + "step": 462 + }, + { + "epoch": 0.8013846819558632, + "grad_norm": 16.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7623167.5, + "logits/rejected": -20095272.0, + "logps/chosen": -129.42367553710938, + "logps/rejected": -185.14320373535156, + "loss": 0.2777, + "rewards/chosen": 0.11407812684774399, + "rewards/margins": 3.284298501908779, + "rewards/rejected": -3.170220375061035, + "step": 463 + }, + { + "epoch": 0.8031155344006924, + "grad_norm": 17.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21579841.333333332, + "logits/rejected": -8914736.0, + "logps/chosen": -177.9075724283854, + "logps/rejected": -242.04560546875, + "loss": 0.2254, + "rewards/chosen": 0.2258871595064799, + "rewards/margins": 3.4008392850557962, + "rewards/rejected": -3.1749521255493165, + "step": 464 + }, + { + "epoch": 0.8048463868455215, + "grad_norm": 18.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18920400.0, + "logits/rejected": -18218436.57142857, + "logps/chosen": -225.89796278211804, + "logps/rejected": -262.46365792410717, + "loss": 0.2714, + "rewards/chosen": 0.5581624242994521, + "rewards/margins": 3.1868504191201827, + "rewards/rejected": -2.628687994820731, + "step": 465 + }, + { + "epoch": 0.8065772392903505, + "grad_norm": 16.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11584106.666666666, + "logits/rejected": -26576393.6, + "logps/chosen": -176.5497029622396, + "logps/rejected": -309.907373046875, + "loss": 0.2317, + "rewards/chosen": 0.07617552081743877, + "rewards/margins": 4.119338820377986, + "rewards/rejected": -4.043163299560547, + "step": 466 + }, + { + "epoch": 0.8083080917351796, + "grad_norm": 17.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12127784.0, + "logits/rejected": -25601744.0, + "logps/chosen": -242.9932657877604, + "logps/rejected": -254.711865234375, + "loss": 0.1971, + "rewards/chosen": 0.3493557373682658, + "rewards/margins": 4.2270679871241255, + "rewards/rejected": -3.8777122497558594, + "step": 467 + }, + { + "epoch": 0.8100389441800087, + "grad_norm": 22.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -30178465.684210528, + "logits/rejected": -17077396.923076924, + "logps/chosen": -268.56396484375, + "logps/rejected": -266.34029447115387, + "loss": 0.3152, + "rewards/chosen": 0.2981703657852976, + "rewards/margins": 3.7815934520983983, + "rewards/rejected": -3.483423086313101, + "step": 468 + }, + { + "epoch": 0.8117697966248377, + "grad_norm": 21.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13815729.066666666, + "logits/rejected": -12947830.588235294, + "logps/chosen": -154.47093098958334, + "logps/rejected": -212.81844554227942, + "loss": 0.3014, + "rewards/chosen": -0.02881938616434733, + "rewards/margins": 2.704724907407573, + "rewards/rejected": -2.7335442935719207, + "step": 469 + }, + { + "epoch": 0.8135006490696668, + "grad_norm": 17.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7885804.923076923, + "logits/rejected": -14350639.157894736, + "logps/chosen": -180.69873046875, + "logps/rejected": -211.95596474095396, + "loss": 0.2547, + "rewards/chosen": -0.020997260625545796, + "rewards/margins": 3.4841915307498654, + "rewards/rejected": -3.5051887913754114, + "step": 470 + }, + { + "epoch": 0.8152315015144959, + "grad_norm": 20.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11280840.0, + "logits/rejected": 3108305.5, + "logps/chosen": -200.45086669921875, + "logps/rejected": -306.38677978515625, + "loss": 0.2812, + "rewards/chosen": 0.11567538976669312, + "rewards/margins": 4.233846604824066, + "rewards/rejected": -4.118171215057373, + "step": 471 + }, + { + "epoch": 0.8169623539593249, + "grad_norm": 23.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -300886.5882352941, + "logits/rejected": -23357789.866666667, + "logps/chosen": -191.05316521139707, + "logps/rejected": -292.22255859375, + "loss": 0.2613, + "rewards/chosen": 0.1551659107208252, + "rewards/margins": 4.588977352778117, + "rewards/rejected": -4.433811442057292, + "step": 472 + }, + { + "epoch": 0.818693206404154, + "grad_norm": 18.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18912087.466666665, + "logits/rejected": -1586782.9411764706, + "logps/chosen": -183.99689127604168, + "logps/rejected": -231.25235523897058, + "loss": 0.2744, + "rewards/chosen": 0.12579172452290852, + "rewards/margins": 3.1985963438071456, + "rewards/rejected": -3.0728046192842373, + "step": 473 + }, + { + "epoch": 0.8204240588489832, + "grad_norm": 19.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12369269.647058824, + "logits/rejected": -11282356.266666668, + "logps/chosen": -168.07036994485293, + "logps/rejected": -267.8197265625, + "loss": 0.3046, + "rewards/chosen": 0.11311096303603228, + "rewards/margins": 4.352031810610902, + "rewards/rejected": -4.23892084757487, + "step": 474 + }, + { + "epoch": 0.8221549112938122, + "grad_norm": 18.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -5711659.5, + "logits/rejected": -12414589.0, + "logps/chosen": -218.7265625, + "logps/rejected": -202.72085571289062, + "loss": 0.2833, + "rewards/chosen": 0.18017937242984772, + "rewards/margins": 3.567504897713661, + "rewards/rejected": -3.3873255252838135, + "step": 475 + }, + { + "epoch": 0.8238857637386413, + "grad_norm": 15.6875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8730379.333333334, + "logits/rejected": -13519862.4, + "logps/chosen": -176.51920572916666, + "logps/rejected": -270.959375, + "loss": 0.2455, + "rewards/chosen": -0.21841418743133545, + "rewards/margins": 3.769347882270813, + "rewards/rejected": -3.9877620697021485, + "step": 476 + }, + { + "epoch": 0.8256166161834704, + "grad_norm": 16.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8386278.153846154, + "logits/rejected": -19970396.63157895, + "logps/chosen": -141.63780799278845, + "logps/rejected": -301.8206722861842, + "loss": 0.2441, + "rewards/chosen": -0.003226254995052631, + "rewards/margins": 4.096735678341708, + "rewards/rejected": -4.09996193333676, + "step": 477 + }, + { + "epoch": 0.8273474686282994, + "grad_norm": 22.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10988549.05263158, + "logits/rejected": -26466306.46153846, + "logps/chosen": -216.68510999177633, + "logps/rejected": -282.0079815204327, + "loss": 0.2822, + "rewards/chosen": 0.37774211481997844, + "rewards/margins": 4.3997006860339205, + "rewards/rejected": -4.0219585712139425, + "step": 478 + }, + { + "epoch": 0.8290783210731285, + "grad_norm": 22.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23037214.0, + "logits/rejected": -28652462.0, + "logps/chosen": -309.24951171875, + "logps/rejected": -286.2002868652344, + "loss": 0.2509, + "rewards/chosen": 0.3773133158683777, + "rewards/margins": 4.476307570934296, + "rewards/rejected": -4.098994255065918, + "step": 479 + }, + { + "epoch": 0.8308091735179576, + "grad_norm": 18.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19562078.666666668, + "logits/rejected": -25442158.4, + "logps/chosen": -217.12479654947916, + "logps/rejected": -336.8718505859375, + "loss": 0.1991, + "rewards/chosen": 0.20399622122446695, + "rewards/margins": 4.110249654452006, + "rewards/rejected": -3.906253433227539, + "step": 480 + }, + { + "epoch": 0.8325400259627866, + "grad_norm": 17.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9583742.0, + "logits/rejected": -15564312.0, + "logps/chosen": -137.34060668945312, + "logps/rejected": -217.027587890625, + "loss": 0.2937, + "rewards/chosen": 0.07569746673107147, + "rewards/margins": 3.1968405693769455, + "rewards/rejected": -3.121143102645874, + "step": 481 + }, + { + "epoch": 0.8342708784076157, + "grad_norm": 17.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14621521.066666666, + "logits/rejected": -12492407.529411765, + "logps/chosen": -227.814599609375, + "logps/rejected": -251.48293887867646, + "loss": 0.2558, + "rewards/chosen": 0.3430078824361165, + "rewards/margins": 3.8711713435603126, + "rewards/rejected": -3.528163461124196, + "step": 482 + }, + { + "epoch": 0.8360017308524448, + "grad_norm": 20.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13449909.05263158, + "logits/rejected": -10140061.538461538, + "logps/chosen": -260.20137746710526, + "logps/rejected": -249.39008037860577, + "loss": 0.2771, + "rewards/chosen": 0.4031766088385331, + "rewards/margins": 3.855549820038954, + "rewards/rejected": -3.452373211200421, + "step": 483 + }, + { + "epoch": 0.837732583297274, + "grad_norm": 17.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22258684.0, + "logits/rejected": -17762546.0, + "logps/chosen": -163.50225830078125, + "logps/rejected": -262.4004211425781, + "loss": 0.2686, + "rewards/chosen": 0.27319496870040894, + "rewards/margins": 3.7916306853294373, + "rewards/rejected": -3.5184357166290283, + "step": 484 + }, + { + "epoch": 0.839463435742103, + "grad_norm": 17.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18354068.57142857, + "logits/rejected": 33215964.444444444, + "logps/chosen": -251.75802176339286, + "logps/rejected": -245.85698784722223, + "loss": 0.2159, + "rewards/chosen": 0.5656452178955078, + "rewards/margins": 4.419897503323025, + "rewards/rejected": -3.8542522854275174, + "step": 485 + }, + { + "epoch": 0.8411942881869321, + "grad_norm": 16.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23547254.85714286, + "logits/rejected": -17003984.0, + "logps/chosen": -196.16643415178572, + "logps/rejected": -243.74088541666666, + "loss": 0.2379, + "rewards/chosen": 0.40252753666469027, + "rewards/margins": 3.8222925852215477, + "rewards/rejected": -3.4197650485568576, + "step": 486 + }, + { + "epoch": 0.8429251406317612, + "grad_norm": 17.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24897616.0, + "logits/rejected": -2443575.5555555555, + "logps/chosen": -195.60096958705358, + "logps/rejected": -183.46891276041666, + "loss": 0.2134, + "rewards/chosen": 0.2992358888898577, + "rewards/margins": 4.218667787218851, + "rewards/rejected": -3.919431898328993, + "step": 487 + }, + { + "epoch": 0.8446559930765902, + "grad_norm": 18.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13898625.066666666, + "logits/rejected": -17252240.94117647, + "logps/chosen": -201.499169921875, + "logps/rejected": -183.95680147058823, + "loss": 0.2776, + "rewards/chosen": 0.07838481267293294, + "rewards/margins": 3.2409737175586173, + "rewards/rejected": -3.1625889048856846, + "step": 488 + }, + { + "epoch": 0.8463868455214193, + "grad_norm": 17.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14268615.272727273, + "logits/rejected": -3743369.523809524, + "logps/chosen": -147.52658913352272, + "logps/rejected": -216.69612630208334, + "loss": 0.2372, + "rewards/chosen": -0.1216210126876831, + "rewards/margins": 3.9766470420928233, + "rewards/rejected": -4.098268054780506, + "step": 489 + }, + { + "epoch": 0.8481176979662484, + "grad_norm": 17.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19717998.933333334, + "logits/rejected": -7990167.529411765, + "logps/chosen": -196.3513671875, + "logps/rejected": -246.92520680147058, + "loss": 0.2343, + "rewards/chosen": 0.4526223182678223, + "rewards/margins": 4.201772987141329, + "rewards/rejected": -3.7491506688735066, + "step": 490 + }, + { + "epoch": 0.8498485504110774, + "grad_norm": 15.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9592658.823529411, + "logits/rejected": -29287362.133333333, + "logps/chosen": -187.97726619944854, + "logps/rejected": -290.8776041666667, + "loss": 0.2586, + "rewards/chosen": 0.3004568885354435, + "rewards/margins": 4.371686441758099, + "rewards/rejected": -4.071229553222656, + "step": 491 + }, + { + "epoch": 0.8515794028559065, + "grad_norm": 27.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -6058852.571428572, + "logits/rejected": -25398426.181818184, + "logps/chosen": -253.44275483630952, + "logps/rejected": -228.32004616477272, + "loss": 0.343, + "rewards/chosen": 0.15320293108622232, + "rewards/margins": 3.436665108709624, + "rewards/rejected": -3.283462177623402, + "step": 492 + }, + { + "epoch": 0.8533102553007356, + "grad_norm": 19.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17457157.333333332, + "logits/rejected": -9991496.0, + "logps/chosen": -211.65770128038196, + "logps/rejected": -186.29868861607142, + "loss": 0.2787, + "rewards/chosen": 0.27119795481363934, + "rewards/margins": 3.3397129149664013, + "rewards/rejected": -3.068514960152762, + "step": 493 + }, + { + "epoch": 0.8550411077455647, + "grad_norm": 20.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27666933.333333332, + "logits/rejected": -35043590.4, + "logps/chosen": -295.1957194010417, + "logps/rejected": -367.8419921875, + "loss": 0.2316, + "rewards/chosen": 0.13595259189605713, + "rewards/margins": 4.060046219825745, + "rewards/rejected": -3.9240936279296874, + "step": 494 + }, + { + "epoch": 0.8567719601903938, + "grad_norm": 18.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23522007.57894737, + "logits/rejected": -13348846.76923077, + "logps/chosen": -195.12703022203948, + "logps/rejected": -253.92705829326923, + "loss": 0.2889, + "rewards/chosen": 0.1074093266537315, + "rewards/margins": 5.130683674986063, + "rewards/rejected": -5.023274348332332, + "step": 495 + }, + { + "epoch": 0.8585028126352229, + "grad_norm": 22.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15495670.260869564, + "logits/rejected": -32671548.444444444, + "logps/chosen": -186.5610988451087, + "logps/rejected": -381.050537109375, + "loss": 0.3368, + "rewards/chosen": 0.2973675520523735, + "rewards/margins": 4.46600217865285, + "rewards/rejected": -4.168634626600477, + "step": 496 + }, + { + "epoch": 0.860233665080052, + "grad_norm": 15.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14984768.0, + "logits/rejected": -13073856.0, + "logps/chosen": -236.09443803267047, + "logps/rejected": -207.42699032738096, + "loss": 0.2293, + "rewards/chosen": 0.08587663823908026, + "rewards/margins": 3.5210474930800397, + "rewards/rejected": -3.4351708548409596, + "step": 497 + }, + { + "epoch": 0.861964517524881, + "grad_norm": 21.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 2496555.777777778, + "logits/rejected": -18931795.42857143, + "logps/chosen": -146.79703776041666, + "logps/rejected": -230.86087472098214, + "loss": 0.3436, + "rewards/chosen": 0.054005821545918785, + "rewards/margins": 2.4009340093249367, + "rewards/rejected": -2.346928187779018, + "step": 498 + }, + { + "epoch": 0.8636953699697101, + "grad_norm": 18.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18158224.0, + "logits/rejected": -11236415.111111112, + "logps/chosen": -224.5089111328125, + "logps/rejected": -189.30841742621527, + "loss": 0.2549, + "rewards/chosen": 0.43646141460963656, + "rewards/margins": 3.5593622714754134, + "rewards/rejected": -3.122900856865777, + "step": 499 + }, + { + "epoch": 0.8654262224145391, + "grad_norm": 16.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8951228.0, + "logits/rejected": -2373882.8, + "logps/chosen": -174.3299560546875, + "logps/rejected": -213.3728759765625, + "loss": 0.252, + "rewards/chosen": 0.11569970846176147, + "rewards/margins": 3.183331620693207, + "rewards/rejected": -3.0676319122314455, + "step": 500 + }, + { + "epoch": 0.8671570748593682, + "grad_norm": 18.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18066339.36842105, + "logits/rejected": -13361725.538461538, + "logps/chosen": -214.2649054276316, + "logps/rejected": -191.958740234375, + "loss": 0.2845, + "rewards/chosen": 0.2815879269650108, + "rewards/margins": 4.328109627310563, + "rewards/rejected": -4.0465217003455525, + "step": 501 + }, + { + "epoch": 0.8688879273041973, + "grad_norm": 17.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13234214.857142856, + "logits/rejected": -12058996.444444444, + "logps/chosen": -210.45964704241072, + "logps/rejected": -243.24937608506946, + "loss": 0.2548, + "rewards/chosen": 0.16222643852233887, + "rewards/margins": 4.25296155611674, + "rewards/rejected": -4.090735117594401, + "step": 502 + }, + { + "epoch": 0.8706187797490264, + "grad_norm": 18.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20217832.533333335, + "logits/rejected": -24252370.82352941, + "logps/chosen": -223.17744140625, + "logps/rejected": -314.5954159007353, + "loss": 0.2316, + "rewards/chosen": 0.40397138595581056, + "rewards/margins": 4.23303928936229, + "rewards/rejected": -3.8290679034064796, + "step": 503 + }, + { + "epoch": 0.8723496321938555, + "grad_norm": 19.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13596191.0, + "logits/rejected": -16392429.0, + "logps/chosen": -176.45895385742188, + "logps/rejected": -263.8536682128906, + "loss": 0.2481, + "rewards/chosen": 0.2699531316757202, + "rewards/margins": 5.313177943229675, + "rewards/rejected": -5.043224811553955, + "step": 504 + }, + { + "epoch": 0.8740804846386846, + "grad_norm": 17.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16117347.764705881, + "logits/rejected": -27767232.0, + "logps/chosen": -137.609130859375, + "logps/rejected": -336.7232421875, + "loss": 0.2598, + "rewards/chosen": 0.11807493602528292, + "rewards/margins": 4.784612411611221, + "rewards/rejected": -4.666537475585938, + "step": 505 + }, + { + "epoch": 0.8758113370835137, + "grad_norm": 16.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20314384.0, + "logits/rejected": -29525336.0, + "logps/chosen": -164.17735290527344, + "logps/rejected": -359.05975341796875, + "loss": 0.2476, + "rewards/chosen": 0.27376165986061096, + "rewards/margins": 5.643193155527115, + "rewards/rejected": -5.369431495666504, + "step": 506 + }, + { + "epoch": 0.8775421895283427, + "grad_norm": 19.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11319587.368421054, + "logits/rejected": -12618192.0, + "logps/chosen": -202.06530119243422, + "logps/rejected": -291.31689453125, + "loss": 0.3081, + "rewards/chosen": 0.11066823256643195, + "rewards/margins": 3.9772089946607827, + "rewards/rejected": -3.866540762094351, + "step": 507 + }, + { + "epoch": 0.8792730419731718, + "grad_norm": 19.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25144013.17647059, + "logits/rejected": -1785538.0, + "logps/chosen": -244.95211971507354, + "logps/rejected": -122.10411783854167, + "loss": 0.2834, + "rewards/chosen": 0.383547025568345, + "rewards/margins": 3.038481209324855, + "rewards/rejected": -2.6549341837565104, + "step": 508 + }, + { + "epoch": 0.8810038944180009, + "grad_norm": 21.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24891217.066666666, + "logits/rejected": -13296007.529411765, + "logps/chosen": -320.0322265625, + "logps/rejected": -299.4142635569853, + "loss": 0.2459, + "rewards/chosen": 0.3699246088663737, + "rewards/margins": 5.3460441551956475, + "rewards/rejected": -4.976119546329274, + "step": 509 + }, + { + "epoch": 0.8827347468628299, + "grad_norm": 18.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -5489797.714285715, + "logits/rejected": -10060837.333333334, + "logps/chosen": -167.43985421316964, + "logps/rejected": -196.89461263020834, + "loss": 0.2576, + "rewards/chosen": 0.23594137600490026, + "rewards/margins": 3.17435330057901, + "rewards/rejected": -2.93841192457411, + "step": 510 + }, + { + "epoch": 0.884465599307659, + "grad_norm": 19.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11461435.555555556, + "logits/rejected": -24541760.0, + "logps/chosen": -195.87803819444446, + "logps/rejected": -274.71878487723217, + "loss": 0.3231, + "rewards/chosen": -0.06326669454574585, + "rewards/margins": 3.601636128766196, + "rewards/rejected": -3.664902823311942, + "step": 511 + }, + { + "epoch": 0.8861964517524881, + "grad_norm": 18.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21993728.0, + "logits/rejected": -33897390.93333333, + "logps/chosen": -196.04171932444854, + "logps/rejected": -349.8721028645833, + "loss": 0.2668, + "rewards/chosen": 0.11794144967023064, + "rewards/margins": 5.360028241662419, + "rewards/rejected": -5.242086791992188, + "step": 512 + }, + { + "epoch": 0.8879273041973171, + "grad_norm": 17.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16249709.538461538, + "logits/rejected": -32942618.94736842, + "logps/chosen": -219.71589543269232, + "logps/rejected": -361.1356650904605, + "loss": 0.219, + "rewards/chosen": 0.224733646099384, + "rewards/margins": 4.102197940532978, + "rewards/rejected": -3.8774642944335938, + "step": 513 + }, + { + "epoch": 0.8896581566421462, + "grad_norm": 19.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3928884.2352941176, + "logits/rejected": -26532153.6, + "logps/chosen": -165.3208438648897, + "logps/rejected": -333.09290364583336, + "loss": 0.2888, + "rewards/chosen": 0.009334767566007726, + "rewards/margins": 5.0305597431519455, + "rewards/rejected": -5.0212249755859375, + "step": 514 + }, + { + "epoch": 0.8913890090869754, + "grad_norm": 20.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22323720.0, + "logits/rejected": -22426698.0, + "logps/chosen": -280.41650390625, + "logps/rejected": -330.9407043457031, + "loss": 0.2395, + "rewards/chosen": 0.32796403765678406, + "rewards/margins": 4.675223082304001, + "rewards/rejected": -4.347259044647217, + "step": 515 + }, + { + "epoch": 0.8931198615318044, + "grad_norm": 16.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7889323.733333333, + "logits/rejected": -17650004.70588235, + "logps/chosen": -166.89734700520833, + "logps/rejected": -290.56959443933823, + "loss": 0.2574, + "rewards/chosen": -0.04637970527013143, + "rewards/margins": 4.045418067305696, + "rewards/rejected": -4.091797772575827, + "step": 516 + }, + { + "epoch": 0.8948507139766335, + "grad_norm": 15.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -4329500.8, + "logits/rejected": -5154179.666666667, + "logps/chosen": -123.65411376953125, + "logps/rejected": -209.98917643229166, + "loss": 0.3525, + "rewards/chosen": -0.03697666525840759, + "rewards/margins": 2.757465829451879, + "rewards/rejected": -2.7944424947102866, + "step": 517 + }, + { + "epoch": 0.8965815664214626, + "grad_norm": 18.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -18556625.066666666, + "logits/rejected": -8416498.823529411, + "logps/chosen": -243.98805338541666, + "logps/rejected": -229.21148322610293, + "loss": 0.2347, + "rewards/chosen": 0.3009837468465169, + "rewards/margins": 4.4175200555838785, + "rewards/rejected": -4.116536308737362, + "step": 518 + }, + { + "epoch": 0.8983124188662917, + "grad_norm": 20.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -1786538.0, + "logits/rejected": -9711140.0, + "logps/chosen": -218.73460388183594, + "logps/rejected": -291.7303771972656, + "loss": 0.2461, + "rewards/chosen": 0.3358926475048065, + "rewards/margins": 4.369432896375656, + "rewards/rejected": -4.03354024887085, + "step": 519 + }, + { + "epoch": 0.9000432713111207, + "grad_norm": 18.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -32728876.0, + "logits/rejected": -23911114.0, + "logps/chosen": -247.1571502685547, + "logps/rejected": -295.3269348144531, + "loss": 0.2546, + "rewards/chosen": 0.26394540071487427, + "rewards/margins": 4.421155750751495, + "rewards/rejected": -4.157210350036621, + "step": 520 + }, + { + "epoch": 0.9017741237559498, + "grad_norm": 21.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19820133.333333332, + "logits/rejected": -9890225.142857144, + "logps/chosen": -278.80579969618054, + "logps/rejected": -152.58932059151786, + "loss": 0.2357, + "rewards/chosen": 0.6421411832173666, + "rewards/margins": 3.4307761646452404, + "rewards/rejected": -2.7886349814278737, + "step": 521 + }, + { + "epoch": 0.9035049762007789, + "grad_norm": 19.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -25119977.846153848, + "logits/rejected": -9313007.157894736, + "logps/chosen": -302.3107346754808, + "logps/rejected": -223.4919562088816, + "loss": 0.2182, + "rewards/chosen": 0.41573304396409255, + "rewards/margins": 4.431605798512818, + "rewards/rejected": -4.015872754548726, + "step": 522 + }, + { + "epoch": 0.9052358286456079, + "grad_norm": 16.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10601564.235294119, + "logits/rejected": -21510574.933333334, + "logps/chosen": -144.97162224264707, + "logps/rejected": -407.3462239583333, + "loss": 0.2763, + "rewards/chosen": 0.06784362652722527, + "rewards/margins": 4.457945758688684, + "rewards/rejected": -4.390102132161458, + "step": 523 + }, + { + "epoch": 0.906966681090437, + "grad_norm": 20.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16169753.0, + "logits/rejected": -9352516.0, + "logps/chosen": -259.9981689453125, + "logps/rejected": -200.53150939941406, + "loss": 0.2423, + "rewards/chosen": 0.5968568325042725, + "rewards/margins": 3.293236017227173, + "rewards/rejected": -2.6963791847229004, + "step": 524 + }, + { + "epoch": 0.9086975335352662, + "grad_norm": 18.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21828404.0, + "logits/rejected": -10200844.0, + "logps/chosen": -254.7109375, + "logps/rejected": -227.15557861328125, + "loss": 0.2457, + "rewards/chosen": 0.578547477722168, + "rewards/margins": 3.6059441566467285, + "rewards/rejected": -3.0273966789245605, + "step": 525 + }, + { + "epoch": 0.9104283859800952, + "grad_norm": 16.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13355686.4, + "logits/rejected": -23458064.94117647, + "logps/chosen": -150.8703125, + "logps/rejected": -284.8030215992647, + "loss": 0.255, + "rewards/chosen": 0.04080080986022949, + "rewards/margins": 4.726873215507059, + "rewards/rejected": -4.686072405646829, + "step": 526 + }, + { + "epoch": 0.9121592384249243, + "grad_norm": 16.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -6665835.733333333, + "logits/rejected": -2375200.470588235, + "logps/chosen": -188.54013671875, + "logps/rejected": -265.4482421875, + "loss": 0.2391, + "rewards/chosen": 0.4144292195638021, + "rewards/margins": 4.037332706825406, + "rewards/rejected": -3.622903487261604, + "step": 527 + }, + { + "epoch": 0.9138900908697534, + "grad_norm": 17.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14207008.0, + "logits/rejected": -8045904.5, + "logps/chosen": -167.97943115234375, + "logps/rejected": -232.38819885253906, + "loss": 0.2789, + "rewards/chosen": -0.028383783996105194, + "rewards/margins": 4.072894521057606, + "rewards/rejected": -4.101278305053711, + "step": 528 + }, + { + "epoch": 0.9156209433145824, + "grad_norm": 20.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -1836523.3333333333, + "logits/rejected": -1962385.7142857143, + "logps/chosen": -230.32972547743054, + "logps/rejected": -202.44597516741072, + "loss": 0.348, + "rewards/chosen": -0.09651472171147664, + "rewards/margins": 2.3565004695029486, + "rewards/rejected": -2.4530151912144254, + "step": 529 + }, + { + "epoch": 0.9173517957594115, + "grad_norm": 16.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -4225653.066666666, + "logits/rejected": -18397138.82352941, + "logps/chosen": -141.47849934895834, + "logps/rejected": -282.91785386029414, + "loss": 0.2529, + "rewards/chosen": 0.06906947294871012, + "rewards/margins": 3.6899047211104747, + "rewards/rejected": -3.6208352481617645, + "step": 530 + }, + { + "epoch": 0.9190826482042406, + "grad_norm": 22.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -26769754.94736842, + "logits/rejected": -4643756.0, + "logps/chosen": -321.6583830180921, + "logps/rejected": -281.6540339543269, + "loss": 0.3001, + "rewards/chosen": 0.2437381995351691, + "rewards/margins": 3.960281837324382, + "rewards/rejected": -3.7165436377892127, + "step": 531 + }, + { + "epoch": 0.9208135006490696, + "grad_norm": 16.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11062548.57142857, + "logits/rejected": -19331664.0, + "logps/chosen": -176.93929617745536, + "logps/rejected": -324.3730197482639, + "loss": 0.2192, + "rewards/chosen": 0.31770481382097515, + "rewards/margins": 4.732227620624361, + "rewards/rejected": -4.414522806803386, + "step": 532 + }, + { + "epoch": 0.9225443530938987, + "grad_norm": 16.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17751121.777777776, + "logits/rejected": -14589137.142857144, + "logps/chosen": -174.75743272569446, + "logps/rejected": -187.13021414620536, + "loss": 0.3185, + "rewards/chosen": 0.009121828609042697, + "rewards/margins": 3.975349496281336, + "rewards/rejected": -3.9662276676722934, + "step": 533 + }, + { + "epoch": 0.9242752055387278, + "grad_norm": 14.4375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11708979.692307692, + "logits/rejected": -30101049.263157893, + "logps/chosen": -138.60321514423077, + "logps/rejected": -393.4633275082237, + "loss": 0.1981, + "rewards/chosen": 0.3644552597632775, + "rewards/margins": 4.962040034382932, + "rewards/rejected": -4.597584774619655, + "step": 534 + }, + { + "epoch": 0.926006057983557, + "grad_norm": 15.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7570177.6, + "logits/rejected": -37518098.823529415, + "logps/chosen": -124.16178385416667, + "logps/rejected": -391.8466796875, + "loss": 0.2543, + "rewards/chosen": 0.27270851135253904, + "rewards/margins": 4.353522199742934, + "rewards/rejected": -4.080813688390395, + "step": 535 + }, + { + "epoch": 0.927736910428386, + "grad_norm": 18.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -9234987.294117646, + "logits/rejected": -14866188.8, + "logps/chosen": -159.22031537224265, + "logps/rejected": -223.50963541666667, + "loss": 0.2958, + "rewards/chosen": -0.004120980992036707, + "rewards/margins": 3.9654270536759317, + "rewards/rejected": -3.9695480346679686, + "step": 536 + }, + { + "epoch": 0.9294677628732151, + "grad_norm": 19.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -23375078.4, + "logits/rejected": -26847497.411764707, + "logps/chosen": -209.49270833333333, + "logps/rejected": -368.5500057444853, + "loss": 0.2421, + "rewards/chosen": 0.19369576772054037, + "rewards/margins": 4.649224924573711, + "rewards/rejected": -4.455529156853171, + "step": 537 + }, + { + "epoch": 0.9311986153180442, + "grad_norm": 19.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -27466656.0, + "logits/rejected": -19032440.0, + "logps/chosen": -259.3592224121094, + "logps/rejected": -292.443603515625, + "loss": 0.2338, + "rewards/chosen": 0.2841821312904358, + "rewards/margins": 5.2895994782447815, + "rewards/rejected": -5.005417346954346, + "step": 538 + }, + { + "epoch": 0.9329294677628732, + "grad_norm": 20.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15543773.538461538, + "logits/rejected": -15453200.842105264, + "logps/chosen": -238.67347130408655, + "logps/rejected": -215.55407072368422, + "loss": 0.2221, + "rewards/chosen": 0.33618956345778245, + "rewards/margins": 3.9660920517647313, + "rewards/rejected": -3.629902488306949, + "step": 539 + }, + { + "epoch": 0.9346603202077023, + "grad_norm": 18.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20258768.0, + "logits/rejected": -27554187.42857143, + "logps/chosen": -186.64798990885416, + "logps/rejected": -223.52950613839286, + "loss": 0.2983, + "rewards/chosen": 0.41172223620944554, + "rewards/margins": 3.2259186865791443, + "rewards/rejected": -2.814196450369699, + "step": 540 + }, + { + "epoch": 0.9363911726525314, + "grad_norm": 21.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": 1046812.6315789474, + "logits/rejected": -21959047.384615384, + "logps/chosen": -198.05195055509867, + "logps/rejected": -311.23080679086536, + "loss": 0.3067, + "rewards/chosen": -0.03541364167865954, + "rewards/margins": 5.0986829441086, + "rewards/rejected": -5.13409658578726, + "step": 541 + }, + { + "epoch": 0.9381220250973604, + "grad_norm": 18.0, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15734675.555555556, + "logits/rejected": -15638189.714285715, + "logps/chosen": -203.97459581163196, + "logps/rejected": -353.4259556361607, + "loss": 0.2716, + "rewards/chosen": 0.4354752169715034, + "rewards/margins": 4.081089840994941, + "rewards/rejected": -3.6456146240234375, + "step": 542 + }, + { + "epoch": 0.9398528775421895, + "grad_norm": 17.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16134838.153846154, + "logits/rejected": -17900727.57894737, + "logps/chosen": -230.89562049278845, + "logps/rejected": -330.51416015625, + "loss": 0.209, + "rewards/chosen": 0.283492950292734, + "rewards/margins": 4.6038022379160894, + "rewards/rejected": -4.320309287623355, + "step": 543 + }, + { + "epoch": 0.9415837299870186, + "grad_norm": 19.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11485119.0, + "logits/rejected": -21915080.0, + "logps/chosen": -185.5915985107422, + "logps/rejected": -276.93609619140625, + "loss": 0.2711, + "rewards/chosen": 0.23676279187202454, + "rewards/margins": 3.7928138077259064, + "rewards/rejected": -3.556051015853882, + "step": 544 + }, + { + "epoch": 0.9433145824318477, + "grad_norm": 19.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13709312.0, + "logits/rejected": -18572339.2, + "logps/chosen": -220.9087344898897, + "logps/rejected": -247.22176106770834, + "loss": 0.2735, + "rewards/chosen": 0.2023478676291073, + "rewards/margins": 4.139690956414915, + "rewards/rejected": -3.9373430887858074, + "step": 545 + }, + { + "epoch": 0.9450454348766768, + "grad_norm": 20.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -4513937.882352941, + "logits/rejected": -15788234.666666666, + "logps/chosen": -190.31519990808823, + "logps/rejected": -332.00501302083336, + "loss": 0.2667, + "rewards/chosen": 0.26923468533684225, + "rewards/margins": 3.873376429314707, + "rewards/rejected": -3.6041417439778645, + "step": 546 + }, + { + "epoch": 0.9467762873215059, + "grad_norm": 18.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16378344.888888888, + "logits/rejected": -11710401.142857144, + "logps/chosen": -171.14954969618054, + "logps/rejected": -178.15471540178572, + "loss": 0.2757, + "rewards/chosen": 0.31609151098463273, + "rewards/margins": 3.7265210567958773, + "rewards/rejected": -3.4104295458112444, + "step": 547 + }, + { + "epoch": 0.9485071397663349, + "grad_norm": 20.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -11216434.823529411, + "logits/rejected": -13815668.266666668, + "logps/chosen": -218.66127642463235, + "logps/rejected": -256.41925455729165, + "loss": 0.2716, + "rewards/chosen": 0.11742969120250028, + "rewards/margins": 4.0313660518795835, + "rewards/rejected": -3.9139363606770834, + "step": 548 + }, + { + "epoch": 0.950237992211164, + "grad_norm": 17.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12267329.523809524, + "logits/rejected": -24378257.454545453, + "logps/chosen": -142.56395321800596, + "logps/rejected": -337.28298117897725, + "loss": 0.319, + "rewards/chosen": 0.21916080656505765, + "rewards/margins": 4.392611623326421, + "rewards/rejected": -4.173450816761363, + "step": 549 + }, + { + "epoch": 0.9519688446559931, + "grad_norm": 20.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -26072582.095238097, + "logits/rejected": -18941517.09090909, + "logps/chosen": -230.01406715029762, + "logps/rejected": -247.8866521661932, + "loss": 0.3268, + "rewards/chosen": 0.19683615366617838, + "rewards/margins": 3.4820738994713984, + "rewards/rejected": -3.28523774580522, + "step": 550 + }, + { + "epoch": 0.9536996971008221, + "grad_norm": 17.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8528191.05882353, + "logits/rejected": -14387735.466666667, + "logps/chosen": -166.89300896139707, + "logps/rejected": -217.92140299479166, + "loss": 0.2973, + "rewards/chosen": 0.2662179890800925, + "rewards/margins": 3.0142236653496237, + "rewards/rejected": -2.748005676269531, + "step": 551 + }, + { + "epoch": 0.9554305495456512, + "grad_norm": 16.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15674302.4, + "logits/rejected": -11228305.454545455, + "logps/chosen": -177.0337890625, + "logps/rejected": -348.75341796875, + "loss": 0.1935, + "rewards/chosen": 0.18450405597686767, + "rewards/margins": 4.309050107002259, + "rewards/rejected": -4.124546051025391, + "step": 552 + }, + { + "epoch": 0.9571614019904803, + "grad_norm": 16.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7863583.0, + "logits/rejected": -15133332.0, + "logps/chosen": -188.83261108398438, + "logps/rejected": -199.46438598632812, + "loss": 0.2808, + "rewards/chosen": -0.06876794993877411, + "rewards/margins": 3.6996007710695267, + "rewards/rejected": -3.768368721008301, + "step": 553 + }, + { + "epoch": 0.9588922544353093, + "grad_norm": 15.5625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -28304765.333333332, + "logits/rejected": -19884369.6, + "logps/chosen": -199.59102376302084, + "logps/rejected": -259.8969970703125, + "loss": 0.2252, + "rewards/chosen": 0.39737669626871747, + "rewards/margins": 3.756306139628092, + "rewards/rejected": -3.358929443359375, + "step": 554 + }, + { + "epoch": 0.9606231068801385, + "grad_norm": 19.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20633886.0, + "logits/rejected": -11431363.0, + "logps/chosen": -240.20606994628906, + "logps/rejected": -209.79103088378906, + "loss": 0.243, + "rewards/chosen": 0.3174148201942444, + "rewards/margins": 4.274515211582184, + "rewards/rejected": -3.9571003913879395, + "step": 555 + }, + { + "epoch": 0.9623539593249676, + "grad_norm": 12.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -5010665.846153846, + "logits/rejected": -21859912.42105263, + "logps/chosen": -133.8183875450721, + "logps/rejected": -301.21653988486844, + "loss": 0.1985, + "rewards/chosen": 0.35960832008948695, + "rewards/margins": 4.6711365761544545, + "rewards/rejected": -4.3115282560649675, + "step": 556 + }, + { + "epoch": 0.9640848117697967, + "grad_norm": 17.875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -10758746.352941176, + "logits/rejected": -20769036.8, + "logps/chosen": -197.44325884650735, + "logps/rejected": -283.14983723958335, + "loss": 0.266, + "rewards/chosen": 0.2531372519100414, + "rewards/margins": 4.772798201617072, + "rewards/rejected": -4.519660949707031, + "step": 557 + }, + { + "epoch": 0.9658156642146257, + "grad_norm": 19.25, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -12659816.0, + "logits/rejected": -5504519.0, + "logps/chosen": -189.08380126953125, + "logps/rejected": -303.64056396484375, + "loss": 0.2795, + "rewards/chosen": 0.18087486922740936, + "rewards/margins": 3.6736734360456467, + "rewards/rejected": -3.4927985668182373, + "step": 558 + }, + { + "epoch": 0.9675465166594548, + "grad_norm": 14.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8637268.0, + "logits/rejected": -26252638.0, + "logps/chosen": -138.60134887695312, + "logps/rejected": -328.9735412597656, + "loss": 0.2587, + "rewards/chosen": 0.08841336518526077, + "rewards/margins": 4.641157276928425, + "rewards/rejected": -4.552743911743164, + "step": 559 + }, + { + "epoch": 0.9692773691042839, + "grad_norm": 18.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -22444739.555555556, + "logits/rejected": -11311891.42857143, + "logps/chosen": -172.24004448784723, + "logps/rejected": -243.47108677455358, + "loss": 0.2801, + "rewards/chosen": 0.23756768968370226, + "rewards/margins": 3.6749363551064147, + "rewards/rejected": -3.4373686654227122, + "step": 560 + }, + { + "epoch": 0.9710082215491129, + "grad_norm": 13.8125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14531388.0, + "logits/rejected": -4354661.5, + "logps/chosen": -125.24881744384766, + "logps/rejected": -271.8765563964844, + "loss": 0.2558, + "rewards/chosen": 0.24584539234638214, + "rewards/margins": 4.299230650067329, + "rewards/rejected": -4.053385257720947, + "step": 561 + }, + { + "epoch": 0.972739073993942, + "grad_norm": 17.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -7341733.714285715, + "logits/rejected": -13311976.888888888, + "logps/chosen": -183.40297154017858, + "logps/rejected": -244.354736328125, + "loss": 0.2543, + "rewards/chosen": -0.005461513996124268, + "rewards/margins": 3.40173919333352, + "rewards/rejected": -3.407200707329644, + "step": 562 + }, + { + "epoch": 0.9744699264387711, + "grad_norm": 15.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -4685177.5, + "logits/rejected": -15349859.0, + "logps/chosen": -123.40423583984375, + "logps/rejected": -225.739013671875, + "loss": 0.2667, + "rewards/chosen": 0.24755218625068665, + "rewards/margins": 4.066319495439529, + "rewards/rejected": -3.8187673091888428, + "step": 563 + }, + { + "epoch": 0.9762007788836001, + "grad_norm": 11.9375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -4233318.545454546, + "logits/rejected": -19686078.476190478, + "logps/chosen": -132.2185613458807, + "logps/rejected": -239.15048363095238, + "loss": 0.178, + "rewards/chosen": 0.3321400555697354, + "rewards/margins": 4.643141061196595, + "rewards/rejected": -4.31100100562686, + "step": 564 + }, + { + "epoch": 0.9779316313284292, + "grad_norm": 17.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -24299501.714285713, + "logits/rejected": -10966127.111111112, + "logps/chosen": -191.37301199776786, + "logps/rejected": -270.11518012152777, + "loss": 0.2166, + "rewards/chosen": 0.3022442545209612, + "rewards/margins": 4.790505159468879, + "rewards/rejected": -4.488260904947917, + "step": 565 + }, + { + "epoch": 0.9796624837732584, + "grad_norm": 20.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -17696162.0, + "logits/rejected": -18060354.0, + "logps/chosen": -199.65805053710938, + "logps/rejected": -237.23216247558594, + "loss": 0.2689, + "rewards/chosen": 0.22615990042686462, + "rewards/margins": 3.021805375814438, + "rewards/rejected": -2.7956454753875732, + "step": 566 + }, + { + "epoch": 0.9813933362180874, + "grad_norm": 18.0, + "kl": 0.04562568664550781, + "learning_rate": 5e-06, + "logits/chosen": -14762987.294117646, + "logits/rejected": -14447952.0, + "logps/chosen": -203.9204819623162, + "logps/rejected": -234.99597981770833, + "loss": 0.2699, + "rewards/chosen": 0.3928084373474121, + "rewards/margins": 3.710885016123454, + "rewards/rejected": -3.318076578776042, + "step": 567 + }, + { + "epoch": 0.9831241886629165, + "grad_norm": 16.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -5363778.461538462, + "logits/rejected": -13101363.368421054, + "logps/chosen": -182.1116661658654, + "logps/rejected": -271.40499074835526, + "loss": 0.2111, + "rewards/chosen": 0.33354718868549055, + "rewards/margins": 4.48431767722373, + "rewards/rejected": -4.15077048853824, + "step": 568 + }, + { + "epoch": 0.9848550411077456, + "grad_norm": 14.1875, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -3061828.0, + "logits/rejected": -8448559.111111112, + "logps/chosen": -130.17633928571428, + "logps/rejected": -212.89643012152777, + "loss": 0.2296, + "rewards/chosen": 0.1730896234512329, + "rewards/margins": 3.825902157359653, + "rewards/rejected": -3.65281253390842, + "step": 569 + }, + { + "epoch": 0.9865858935525746, + "grad_norm": 21.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -19105656.888888888, + "logits/rejected": -16878851.42857143, + "logps/chosen": -270.48130967881946, + "logps/rejected": -260.11336844308033, + "loss": 0.2856, + "rewards/chosen": 0.21564624044630262, + "rewards/margins": 3.7514880460406106, + "rewards/rejected": -3.535841805594308, + "step": 570 + }, + { + "epoch": 0.9883167459974037, + "grad_norm": 16.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -21822586.352941178, + "logits/rejected": -4643079.466666667, + "logps/chosen": -202.06014476102942, + "logps/rejected": -213.38444010416666, + "loss": 0.3039, + "rewards/chosen": 0.2961447098675896, + "rewards/margins": 3.843302251778397, + "rewards/rejected": -3.5471575419108072, + "step": 571 + }, + { + "epoch": 0.9900475984422328, + "grad_norm": 18.75, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -8295186.52631579, + "logits/rejected": -511194.26923076925, + "logps/chosen": -182.56332236842104, + "logps/rejected": -161.923828125, + "loss": 0.2759, + "rewards/chosen": 0.31847027728432103, + "rewards/margins": 4.091251942792884, + "rewards/rejected": -3.7727816655085635, + "step": 572 + }, + { + "epoch": 0.9917784508870618, + "grad_norm": 21.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14081197.333333334, + "logits/rejected": -14559613.714285715, + "logps/chosen": -228.43486870659723, + "logps/rejected": -303.82627650669644, + "loss": 0.267, + "rewards/chosen": 0.23746993806627062, + "rewards/margins": 5.186938278258793, + "rewards/rejected": -4.949468340192523, + "step": 573 + }, + { + "epoch": 0.9935093033318909, + "grad_norm": 18.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -16850096.0, + "logits/rejected": -11168912.0, + "logps/chosen": -180.65089416503906, + "logps/rejected": -196.49746704101562, + "loss": 0.2863, + "rewards/chosen": 0.09208297729492188, + "rewards/margins": 3.3805606365203857, + "rewards/rejected": -3.288477659225464, + "step": 574 + }, + { + "epoch": 0.99524015577672, + "grad_norm": 22.125, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -13447976.0, + "logits/rejected": -34808642.666666664, + "logps/chosen": -201.29320068359374, + "logps/rejected": -389.7504069010417, + "loss": 0.2941, + "rewards/chosen": 0.28269662857055666, + "rewards/margins": 4.285934670766195, + "rewards/rejected": -4.003238042195638, + "step": 575 + }, + { + "epoch": 0.9969710082215492, + "grad_norm": 15.625, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -15401051.42857143, + "logits/rejected": -14824715.555555556, + "logps/chosen": -164.81647600446428, + "logps/rejected": -281.62188042534723, + "loss": 0.2427, + "rewards/chosen": 0.25805938243865967, + "rewards/margins": 3.9605329169167414, + "rewards/rejected": -3.7024735344780817, + "step": 576 + }, + { + "epoch": 0.9987018606663782, + "grad_norm": 25.375, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -14908833.391304348, + "logits/rejected": -34570872.88888889, + "logps/chosen": -301.42673658288044, + "logps/rejected": -293.209228515625, + "loss": 0.3005, + "rewards/chosen": 0.3803315784620202, + "rewards/margins": 4.98188178665972, + "rewards/rejected": -4.6015502081976996, + "step": 577 + }, + { + "epoch": 1.0017308524448292, + "grad_norm": 20.5, + "kl": 0.0, + "learning_rate": 5e-06, + "logits/chosen": -20511636.70588235, + "logits/rejected": -19425946.48275862, + "logps/chosen": -240.42333984375, + "logps/rejected": -257.1704775053879, + "loss": 0.284, + "rewards/chosen": 0.4028752270866843, + "rewards/margins": 4.151148879262061, + "rewards/rejected": -3.748273652175377, + "step": 578 + } + ], + "logging_steps": 1, + "max_steps": 578, + "num_input_tokens_seen": 0, + "num_train_epochs": 2, + "save_steps": 289, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}